antiagainst/conv-conversion.mlir

## conv-conversion.mlir
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
  func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    return %0 : tensor<1x112x112x32xf32>
  }
}

// -----// IR Dump After Canonicalizer //----- //
func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
  %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  return %0 : tensor<1x112x112x32xf32>
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After Inliner //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After SymbolDCE //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After TopLevelSCFToCFG //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After ShapeToShapeLowering //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After ConvertShapeToStandard //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After Inliner //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After LegalizeInputTypes //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %cst = constant 0.000000e+00 : f32
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}

// -----// IR Dump After IREEImportPublic //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}

// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After VerifyInputLegality //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After ExpandGlobalDynamicDims //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}

// -----// IR Dump After PadTensorToSubTensorInsert //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}

// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After InterchangeGenericOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After FusionOfTensorOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}

// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}

// -----// IR Dump After DispatchLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %cst = constant 0.000000e+00 : f32
    %c112_0 = constant 112 : index
    %c32_1 = constant 32 : index
    %4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %5 to %c112_0 step %6 {
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %7 to %c112_0 step %8 {
        %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %9 to %c32_1 step %10 {
          %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
          %13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
          %15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
          %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
          %20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
          %22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
          %23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
          %24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
          %25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
          %26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module  {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
      %cst = constant 0.000000e+00 : f32
      %c112_0 = constant 112 : index
      %c32_1 = constant 32 : index
      %4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
      %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
      %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
      %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
      %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
      %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
      %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
      %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
      %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
      %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
      %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
      %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
      scf.for %arg5 = %5 to %c112_0 step %6 {
        %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
        %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
        scf.for %arg6 = %7 to %c112_0 step %8 {
          %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
          %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
          scf.for %arg7 = %9 to %c32_1 step %10 {
            %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
            %12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
            %13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
            %14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
            %15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
            %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
            %17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
            %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
            %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
            %20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
            %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
            %22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
            %23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
            %24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
            %25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
            %26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
            flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
          }
        }
      }
      flow.return
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %cst = constant 0.000000e+00 : f32
    %c112_0 = constant 112 : index
    %c32_1 = constant 32 : index
    %4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %5 to %c112_0 step %6 {
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %7 to %c112_0 step %8 {
        %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %9 to %c32_1 step %10 {
          %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
          %13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
          %15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
          %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
          %20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
          %22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
          %23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
          %24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
          %25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
          %26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %c32_0 = constant 32 : index
    %c112_1 = constant 112 : index
    %cst = constant 0.000000e+00 : f32
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %4 to %c112_1 step %5 {
      %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %6 to %c112_1 step %7 {
        %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %8 to %c32_0 step %9 {
          %10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %11 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
          %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %13 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
          %14 = flow.dispatch.tensor.load %arg2, offsets = [0, %10, %12, 0], sizes = [1, %11, %13, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %16 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %15], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          %17 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
          %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
          %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %20 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
          %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
          %22 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
          %23 = linalg.init_tensor [1, %20, %21, %22] : tensor<1x?x?x?xf32>
          %24 = linalg.fill(%cst, %23) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
          %25 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%14, %16 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%24 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %25, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %17, %18, %19], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After OutlineDispatchRegions //----- //
module  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After DeduplicateExecutables //----- //
module  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After HoistUnstreamableOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After InsertConstantClones //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After FormStreams //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %4 : tensor<1x112x112x32xf32>
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After OutlineLargeConstants //----- //
module  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %4 : tensor<1x112x112x32xf32>
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %4 : tensor<1x112x112x32xf32>
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}

// -----// IR Dump After SymbolDCE //----- //
module  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After SymbolDCE //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module  {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
      builtin.module  {
        func @conv_dispatch_0() {
          %c0 = constant 0 : index
          %c32 = constant 32 : index
          %c112 = constant 112 : index
          %cst = constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
          %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
          %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
          %workgroup_size_x = hal.interface.workgroup.size[0] : index
          %workgroup_size_y = hal.interface.workgroup.size[1] : index
          %workgroup_size_z = hal.interface.workgroup.size[2] : index
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          %workgroup_id_z = hal.interface.workgroup.id[2] : index
          %workgroup_count_z = hal.interface.workgroup.count[2] : index
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
          %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
          scf.for %arg0 = %3 to %c112 step %4 {
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
            %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
            scf.for %arg1 = %5 to %c112 step %6 {
              %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
              %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
              scf.for %arg2 = %7 to %c32 step %8 {
                %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
                %10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
                %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
                %12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
                %13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
                %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
                %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
                %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
                %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
                %18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
                %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
                %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
                %21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
                %22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
                %23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
                %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
                flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
              }
            }
          }
          return
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
      builtin.module  {
        func @conv_dispatch_0() {
          %cst = constant 0.000000e+00 : f32
          %c112 = constant 112 : index
          %c32 = constant 32 : index
          %c0 = constant 0 : index
          %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
          %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
          %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
          %workgroup_size_x = hal.interface.workgroup.size[0] : index
          %workgroup_size_y = hal.interface.workgroup.size[1] : index
          %workgroup_size_z = hal.interface.workgroup.size[2] : index
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          %workgroup_id_z = hal.interface.workgroup.id[2] : index
          %workgroup_count_z = hal.interface.workgroup.count[2] : index
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
          %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
          scf.for %arg0 = %3 to %c112 step %4 {
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
            %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
            scf.for %arg1 = %5 to %c112 step %6 {
              %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
              %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
              scf.for %arg2 = %7 to %c32 step %8 {
                %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
                %10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
                %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
                %12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
                %13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
                %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
                %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
                %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
                %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
                %18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
                %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
                %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
                %21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
                %22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
                %23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
                %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
                flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
              }
            }
          }
          return
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
  hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
  builtin.module  {
    func @conv_dispatch_0() {
      %cst = constant 0.000000e+00 : f32
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %c0 = constant 0 : index
      %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
      %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
      %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
      %workgroup_size_x = hal.interface.workgroup.size[0] : index
      %workgroup_size_y = hal.interface.workgroup.size[1] : index
      %workgroup_size_z = hal.interface.workgroup.size[2] : index
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_count_x = hal.interface.workgroup.count[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_count_y = hal.interface.workgroup.count[1] : index
      %workgroup_id_z = hal.interface.workgroup.id[2] : index
      %workgroup_count_z = hal.interface.workgroup.count[2] : index
      %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
      %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
      scf.for %arg0 = %3 to %c112 step %4 {
        %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
        %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
        scf.for %arg1 = %5 to %c112 step %6 {
          %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
          %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
          scf.for %arg2 = %7 to %c32 step %8 {
            %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
            %10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
            %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
            %12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
            %13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
            %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
            %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
            %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
            %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
            %18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
            %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
            %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
            %21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
            %22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
            %23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
            %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
            flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
          }
        }
      }
      return
    }
    hal.interface private @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// -----// IR Dump After LinalgBufferize //----- //
func @conv_dispatch_0() {
  %cst = constant 0.000000e+00 : f32
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
  %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %6 to %c112 step %7 {
    %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %8 to %c112 step %9 {
      %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %10 to %c32 step %11 {
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
        %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
        %16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %17 = flow.dispatch.tensor.load %1, offsets = [0, %12, %14, 0], sizes = [1, %13, %15, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
        %18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %19 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %18] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %20 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %18], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
        %21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %22 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %23 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %24 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
        %25 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
        %26 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
        %27 = linalg.init_tensor [1, %24, %25, %26] : tensor<1x?x?x?xf32>
        %28 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %29 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %30 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %31 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %28, %29, %30] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.fill(%cst, %31) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        %32 = linalg.fill(%cst, %27) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
        linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %19 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%31 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
        %33 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%17, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%32 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
      }
    }
  }
  return
}

// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant 0.000000e+00 : f32
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
    %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
    %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
    %workgroup_size_x = hal.interface.workgroup.size[0] : index
    %workgroup_size_y = hal.interface.workgroup.size[1] : index
    %workgroup_size_z = hal.interface.workgroup.size[2] : index
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %workgroup_count_z = hal.interface.workgroup.count[2] : index
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
    %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
    scf.for %arg0 = %6 to %c112 step %7 {
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
      %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
      scf.for %arg1 = %8 to %c112 step %9 {
        %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
        %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
        scf.for %arg2 = %10 to %c32 step %11 {
          %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
          %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
          %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
          %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
          %16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
          %18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
          %20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
          %21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
          %22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
          linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
          linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
        }
      }
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %cst = constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
  %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %6 to %c112 step %7 {
    %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %8 to %c112 step %9 {
      %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %10 to %c32 step %11 {
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
        %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
        %16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// -----// IR Dump After CSE //----- //
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %cst = constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
  %4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %6 to %c112 step %7 {
    %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %8 to %c112 step %9 {
      %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %10 to %c32 step %11 {
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
        %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
        %16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %21 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %17] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.fill(%cst, %21) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%21 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// -----// IR Dump After CleanupBufferAllocView //----- //
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %cst = constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %3 to %c112 step %4 {
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %5 to %c112 step %6 {
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %7 to %c32 step %8 {
        %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
        %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
        %13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.fill(%cst, %18) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// -----// IR Dump After SetNumWorkgroups //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
  hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c112 = constant 112 : index
    %c14 = constant 14 : index
    %c1 = constant 1 : index
    hal.return %c1, %c14, %c112 : index, index, index
  }
  builtin.module  {
    func @conv_dispatch_0() {
      %c0 = constant 0 : index
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %cst = constant 0.000000e+00 : f32
      %c8 = constant 8 : index
      %c1 = constant 1 : index
      %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
      %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
      %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_count_x = hal.interface.workgroup.count[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_count_y = hal.interface.workgroup.count[1] : index
      %workgroup_id_z = hal.interface.workgroup.id[2] : index
      %workgroup_count_z = hal.interface.workgroup.count[2] : index
      %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c1]
      %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %c1]
      scf.for %arg0 = %3 to %c112 step %4 {
        %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c8]
        %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c8]
        scf.for %arg1 = %5 to %c112 step %6 {
          %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c32]
          %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c32]
          scf.for %arg2 = %7 to %c32 step %8 {
            %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
            %10 = affine.min affine_map<(d0)[s0] -> (3, d0 * -2 + 227)>(%arg0)[%c1]
            %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
            %12 = affine.min affine_map<(d0)[s0] -> (17, d0 * -2 + 227)>(%arg1)[%c8]
            %13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
            %14 = affine.min affine_map<(d0)[s0] -> (32, -d0 + 32)>(%arg2)[%c32]
            %15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
            %16 = affine.min affine_map<(d0)[s0] -> (1, -d0 + 112)>(%arg0)[%c1]
            %17 = affine.min affine_map<(d0)[s0] -> (8, -d0 + 112)>(%arg1)[%c8]
            %18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
            linalg.fill(%cst, %18) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
            linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
          }
        }
      }
      return
    }
    hal.interface private @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// -----// IR Dump After Canonicalizer //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
  hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c1 = constant 1 : index
    %c14 = constant 14 : index
    %c112 = constant 112 : index
    hal.return %c1, %c14, %c112 : index, index, index
  }
  builtin.module  {
    func @conv_dispatch_0() {
      %cst = constant 0.000000e+00 : f32
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %c0 = constant 0 : index
      %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
      %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
      %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_count_x = hal.interface.workgroup.count[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_count_y = hal.interface.workgroup.count[1] : index
      %workgroup_id_z = hal.interface.workgroup.id[2] : index
      %workgroup_count_z = hal.interface.workgroup.count[2] : index
      scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
        %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
        %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
        scf.for %arg1 = %3 to %c112 step %4 {
          %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
          %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
          scf.for %arg2 = %5 to %c32 step %6 {
            %7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
            %8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
            %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
            %10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
            %11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
            %12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
            %13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
            %14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
            %15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
            %16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
            linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
            linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
          }
        }
      }
      return
    }
    hal.interface private @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  func @conv_dispatch_0() {
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %cst = constant 0.000000e+00 : f32
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %workgroup_count_z = hal.interface.workgroup.count[2] : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
      %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
      %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
      scf.for %arg1 = %3 to %c112 step %4 {
        %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
        %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
        scf.for %arg2 = %5 to %c32 step %6 {
          %7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
          %8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
          %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
          %10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
          %11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
          %13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
          %15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
          %16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
          linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
          linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
        }
      }
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After CSE //----- //
module  {
  func @conv_dispatch_0() {
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %cst = constant 0.000000e+00 : f32
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %workgroup_count_z = hal.interface.workgroup.count[2] : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
      %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
      %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
      scf.for %arg1 = %3 to %c112 step %4 {
        %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
        %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
        scf.for %arg2 = %5 to %c32 step %6 {
          %7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
          %8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
          %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
          %10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
          %11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
          %13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
          %15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
          %16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
          linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
          linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
        }
      }
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After SPIRVRemoveOneTripTiledLoop //----- //
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %c112 = constant 112 : index
  %cst = constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
    %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
    %6 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
    %7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%3)
    %8 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%3)
    %9 = memref.subview %0[0, %5, %7, 0] [1, %6, %8, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %10 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%4)
    %11 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, %10] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %12 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
    %13 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%3)
    %14 = memref.subview %2[0, %arg0, %3, %4] [1, %12, %13, %10] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    linalg.fill(%cst, %14) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%9, %11 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%14 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
  }
  return
}

// -----// IR Dump After SPIRVTileAndDistribute //----- //
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %c112 = constant 112 : index
  %cst = constant 0.000000e+00 : f32
  %c3 = constant 3 : index
  %c1 = constant 1 : index
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
    %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
    %6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
    %7 = memref.subview %0[0, %5, %6, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %8 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %9 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %11 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
    %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%10]
    %15 = memref.subview %9[0, %12, %13, %14] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    linalg.fill(%cst, %15) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %16 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %17 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %18 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
    %20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16]
    %21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
    %22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17]
    %23 = memref.subview %7[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %24 = memref.subview %8[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %25 = memref.subview %9[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    scf.for %arg1 = %c0 to %c3 step %c1 {
      scf.for %arg2 = %c0 to %c3 step %c1 {
        %26 = memref.subview %23[0, %arg1, %arg2, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %27 = memref.subview %24[%arg1, %arg2, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "vectorize", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%26, %27 : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%25 : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// -----// IR Dump After SPIRVVectorize //----- //
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %c112 = constant 112 : index
  %c3 = constant 3 : index
  %c1 = constant 1 : index
  %cst = constant dense<0.000000e+00> : vector<1x1x4x4xf32>
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  %cst_0 = constant 0.000000e+00 : f32
  %cst_1 = constant dense<0.000000e+00> : vector<1x4xf32>
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
  %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
  %11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
  %12 = vector.extract_strided_slice %cst {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
  %13 = vector.extract_strided_slice %cst {offsets = [0, 0, 1, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
  %14 = vector.extract_strided_slice %cst {offsets = [0, 0, 2, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
  %15 = vector.extract_strided_slice %cst {offsets = [0, 0, 3, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
  %16 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %17 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %18 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16]
  %21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
  %22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17]
  %23 = memref.subview %6[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %24 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
    %25 = memref.subview %0[0, %24, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %26 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %27 = memref.subview %26[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %12, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %13, %27[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %14, %27[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %15, %27[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %28 = memref.subview %25[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %29 = memref.subview %26[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %30 = vector.transfer_read %29[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %31 = vector.transfer_read %29[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %32 = vector.transfer_read %29[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %33 = vector.transfer_read %29[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %34:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %30, %arg3 = %31, %arg4 = %32, %arg5 = %33) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %35:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %36 = memref.subview %28[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %37 = memref.subview %23[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %38 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %39 = vector.transfer_read %37[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %40 = vector.transfer_read %37[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %41 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %43 = vector.extract %38[0] : vector<1x4xf32>
        %44 = vector.extract %42[0, 0] : vector<1x1xf32>
        %45 = splat %44 : vector<4xf32>
        %46 = vector.extract %arg7[0] : vector<1x4xf32>
        %47 = vector.fma %45, %43, %46 : vector<4xf32>
        %48 = vector.extract_strided_slice %41 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %49 = vector.extract %39[0] : vector<1x4xf32>
        %50 = vector.extract %48[0, 0] : vector<1x1xf32>
        %51 = splat %50 : vector<4xf32>
        %52 = vector.fma %51, %49, %47 : vector<4xf32>
        %53 = vector.extract_strided_slice %41 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %54 = vector.extract %40[0] : vector<1x4xf32>
        %55 = vector.extract %53[0, 0] : vector<1x1xf32>
        %56 = splat %55 : vector<4xf32>
        %57 = vector.fma %56, %54, %52 : vector<4xf32>
        %58 = vector.insert %57, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
        %59 = vector.transfer_read %36[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %60 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %61 = vector.extract %38[0] : vector<1x4xf32>
        %62 = vector.extract %60[0, 0] : vector<1x1xf32>
        %63 = splat %62 : vector<4xf32>
        %64 = vector.extract %arg8[0] : vector<1x4xf32>
        %65 = vector.fma %63, %61, %64 : vector<4xf32>
        %66 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %67 = vector.extract %39[0] : vector<1x4xf32>
        %68 = vector.extract %66[0, 0] : vector<1x1xf32>
        %69 = splat %68 : vector<4xf32>
        %70 = vector.fma %69, %67, %65 : vector<4xf32>
        %71 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %72 = vector.extract %40[0] : vector<1x4xf32>
        %73 = vector.extract %71[0, 0] : vector<1x1xf32>
        %74 = splat %73 : vector<4xf32>
        %75 = vector.fma %74, %72, %70 : vector<4xf32>
        %76 = vector.insert %75, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
        %77 = vector.transfer_read %36[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %78 = vector.extract_strided_slice %77 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %79 = vector.extract %38[0] : vector<1x4xf32>
        %80 = vector.extract %78[0, 0] : vector<1x1xf32>
        %81 = splat %80 : vector<4xf32>
        %82 = vector.extract %arg9[0] : vector<1x4xf32>
        %83 = vector.fma %81, %79, %82 : vector<4xf32>
        %84 = vector.extract_strided_slice %77 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %85 = vector.extract %39[0] : vector<1x4xf32>
        %86 = vector.extract %84[0, 0] : vector<1x1xf32>
        %87 = splat %86 : vector<4xf32>
        %88 = vector.fma %87, %85, %83 : vector<4xf32>
        %89 = vector.extract_strided_slice %77 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %90 = vector.extract %40[0] : vector<1x4xf32>
        %91 = vector.extract %89[0, 0] : vector<1x1xf32>
        %92 = splat %91 : vector<4xf32>
        %93 = vector.fma %92, %90, %88 : vector<4xf32>
        %94 = vector.insert %93, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
        %95 = vector.transfer_read %36[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %96 = vector.extract_strided_slice %95 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %97 = vector.extract %38[0] : vector<1x4xf32>
        %98 = vector.extract %96[0, 0] : vector<1x1xf32>
        %99 = splat %98 : vector<4xf32>
        %100 = vector.extract %arg10[0] : vector<1x4xf32>
        %101 = vector.fma %99, %97, %100 : vector<4xf32>
        %102 = vector.extract_strided_slice %95 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %103 = vector.extract %39[0] : vector<1x4xf32>
        %104 = vector.extract %102[0, 0] : vector<1x1xf32>
        %105 = splat %104 : vector<4xf32>
        %106 = vector.fma %105, %103, %101 : vector<4xf32>
        %107 = vector.extract_strided_slice %95 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %108 = vector.extract %40[0] : vector<1x4xf32>
        %109 = vector.extract %107[0, 0] : vector<1x1xf32>
        %110 = splat %109 : vector<4xf32>
        %111 = vector.fma %110, %108, %106 : vector<4xf32>
        %112 = vector.insert %111, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
        scf.yield %58, %76, %94, %112 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %35#0, %35#1, %35#2, %35#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    vector.transfer_write %34#3, %29[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %34#2, %29[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %34#1, %29[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %34#0, %29[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  }
  return
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %cst_0 = constant 0.000000e+00 : f32
    %c6 = constant 6 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
    %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
    %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
    %11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
    %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %14 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
    %16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
    %17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
    %18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
    %19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
      %21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %40 = vector.extract %38[0, 0] : vector<1x1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %43 = vector.fma %41, %39, %42 : vector<4xf32>
          %44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %46 = vector.extract %44[0, 0] : vector<1x1xf32>
          %47 = splat %46 : vector<4xf32>
          %48 = vector.fma %47, %45, %43 : vector<4xf32>
          %49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %51 = vector.extract %49[0, 0] : vector<1x1xf32>
          %52 = splat %51 : vector<4xf32>
          %53 = vector.fma %52, %50, %48 : vector<4xf32>
          %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
          %55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %58 = vector.extract %56[0, 0] : vector<1x1xf32>
          %59 = splat %58 : vector<4xf32>
          %60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %61 = vector.fma %59, %57, %60 : vector<4xf32>
          %62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %64 = vector.extract %62[0, 0] : vector<1x1xf32>
          %65 = splat %64 : vector<4xf32>
          %66 = vector.fma %65, %63, %61 : vector<4xf32>
          %67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %69 = vector.extract %67[0, 0] : vector<1x1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %68, %66 : vector<4xf32>
          %72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
          %73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %76 = vector.extract %74[0, 0] : vector<1x1xf32>
          %77 = splat %76 : vector<4xf32>
          %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %79 = vector.fma %77, %75, %78 : vector<4xf32>
          %80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %82 = vector.extract %80[0, 0] : vector<1x1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.fma %83, %81, %79 : vector<4xf32>
          %85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %87 = vector.extract %85[0, 0] : vector<1x1xf32>
          %88 = splat %87 : vector<4xf32>
          %89 = vector.fma %88, %86, %84 : vector<4xf32>
          %90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
          %91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %94 = vector.extract %92[0, 0] : vector<1x1xf32>
          %95 = splat %94 : vector<4xf32>
          %96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %97 = vector.fma %95, %93, %96 : vector<4xf32>
          %98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %100 = vector.extract %98[0, 0] : vector<1x1xf32>
          %101 = splat %100 : vector<4xf32>
          %102 = vector.fma %101, %99, %97 : vector<4xf32>
          %103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %105 = vector.extract %103[0, 0] : vector<1x1xf32>
          %106 = splat %105 : vector<4xf32>
          %107 = vector.fma %106, %104, %102 : vector<4xf32>
          %108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
          scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After SPIRVCopyToWorkgroupMemory //----- //
func @conv_dispatch_0() {
  %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %cst_0 = constant 0.000000e+00 : f32
  %c6 = constant 6 : index
  %c2 = constant 2 : index
  %c4 = constant 4 : index
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c112 = constant 112 : index
  %c0 = constant 0 : index
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
  %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
  %11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
  %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %14 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
  %17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
  %18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
  %19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
    %21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %40 = vector.extract %38[0, 0] : vector<1x1xf32>
        %41 = splat %40 : vector<4xf32>
        %42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %43 = vector.fma %41, %39, %42 : vector<4xf32>
        %44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %46 = vector.extract %44[0, 0] : vector<1x1xf32>
        %47 = splat %46 : vector<4xf32>
        %48 = vector.fma %47, %45, %43 : vector<4xf32>
        %49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %51 = vector.extract %49[0, 0] : vector<1x1xf32>
        %52 = splat %51 : vector<4xf32>
        %53 = vector.fma %52, %50, %48 : vector<4xf32>
        %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
        %55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %58 = vector.extract %56[0, 0] : vector<1x1xf32>
        %59 = splat %58 : vector<4xf32>
        %60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %61 = vector.fma %59, %57, %60 : vector<4xf32>
        %62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %64 = vector.extract %62[0, 0] : vector<1x1xf32>
        %65 = splat %64 : vector<4xf32>
        %66 = vector.fma %65, %63, %61 : vector<4xf32>
        %67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %69 = vector.extract %67[0, 0] : vector<1x1xf32>
        %70 = splat %69 : vector<4xf32>
        %71 = vector.fma %70, %68, %66 : vector<4xf32>
        %72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
        %73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %76 = vector.extract %74[0, 0] : vector<1x1xf32>
        %77 = splat %76 : vector<4xf32>
        %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %79 = vector.fma %77, %75, %78 : vector<4xf32>
        %80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %82 = vector.extract %80[0, 0] : vector<1x1xf32>
        %83 = splat %82 : vector<4xf32>
        %84 = vector.fma %83, %81, %79 : vector<4xf32>
        %85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %87 = vector.extract %85[0, 0] : vector<1x1xf32>
        %88 = splat %87 : vector<4xf32>
        %89 = vector.fma %88, %86, %84 : vector<4xf32>
        %90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
        %91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %94 = vector.extract %92[0, 0] : vector<1x1xf32>
        %95 = splat %94 : vector<4xf32>
        %96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
        %97 = vector.fma %95, %93, %96 : vector<4xf32>
        %98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %100 = vector.extract %98[0, 0] : vector<1x1xf32>
        %101 = splat %100 : vector<4xf32>
        %102 = vector.fma %101, %99, %97 : vector<4xf32>
        %103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %105 = vector.extract %103[0, 0] : vector<1x1xf32>
        %106 = splat %105 : vector<4xf32>
        %107 = vector.fma %106, %104, %102 : vector<4xf32>
        %108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
        scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  }
  return
}

// -----// IR Dump After LinalgExtToLoops //----- //
func @conv_dispatch_0() {
  %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %cst_0 = constant 0.000000e+00 : f32
  %c6 = constant 6 : index
  %c2 = constant 2 : index
  %c4 = constant 4 : index
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c112 = constant 112 : index
  %c0 = constant 0 : index
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
  %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
  %11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
  %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %14 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
  %17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
  %18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
  %19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
    %21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %40 = vector.extract %38[0, 0] : vector<1x1xf32>
        %41 = splat %40 : vector<4xf32>
        %42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %43 = vector.fma %41, %39, %42 : vector<4xf32>
        %44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %46 = vector.extract %44[0, 0] : vector<1x1xf32>
        %47 = splat %46 : vector<4xf32>
        %48 = vector.fma %47, %45, %43 : vector<4xf32>
        %49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %51 = vector.extract %49[0, 0] : vector<1x1xf32>
        %52 = splat %51 : vector<4xf32>
        %53 = vector.fma %52, %50, %48 : vector<4xf32>
        %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
        %55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %58 = vector.extract %56[0, 0] : vector<1x1xf32>
        %59 = splat %58 : vector<4xf32>
        %60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %61 = vector.fma %59, %57, %60 : vector<4xf32>
        %62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %64 = vector.extract %62[0, 0] : vector<1x1xf32>
        %65 = splat %64 : vector<4xf32>
        %66 = vector.fma %65, %63, %61 : vector<4xf32>
        %67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %69 = vector.extract %67[0, 0] : vector<1x1xf32>
        %70 = splat %69 : vector<4xf32>
        %71 = vector.fma %70, %68, %66 : vector<4xf32>
        %72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
        %73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %76 = vector.extract %74[0, 0] : vector<1x1xf32>
        %77 = splat %76 : vector<4xf32>
        %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %79 = vector.fma %77, %75, %78 : vector<4xf32>
        %80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %82 = vector.extract %80[0, 0] : vector<1x1xf32>
        %83 = splat %82 : vector<4xf32>
        %84 = vector.fma %83, %81, %79 : vector<4xf32>
        %85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %87 = vector.extract %85[0, 0] : vector<1x1xf32>
        %88 = splat %87 : vector<4xf32>
        %89 = vector.fma %88, %86, %84 : vector<4xf32>
        %90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
        %91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %94 = vector.extract %92[0, 0] : vector<1x1xf32>
        %95 = splat %94 : vector<4xf32>
        %96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
        %97 = vector.fma %95, %93, %96 : vector<4xf32>
        %98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %100 = vector.extract %98[0, 0] : vector<1x1xf32>
        %101 = splat %100 : vector<4xf32>
        %102 = vector.fma %101, %99, %97 : vector<4xf32>
        %103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %105 = vector.extract %103[0, 0] : vector<1x1xf32>
        %106 = splat %105 : vector<4xf32>
        %107 = vector.fma %106, %104, %102 : vector<4xf32>
        %108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
        scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  }
  return
}

// -----// IR Dump After LinalgLowerToLoops //----- //
func @conv_dispatch_0() {
  %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %cst_0 = constant 0.000000e+00 : f32
  %c6 = constant 6 : index
  %c2 = constant 2 : index
  %c4 = constant 4 : index
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c112 = constant 112 : index
  %c0 = constant 0 : index
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
  %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
  %11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
  %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %14 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
  %17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
  %18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
  %19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
    %21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %40 = vector.extract %38[0, 0] : vector<1x1xf32>
        %41 = splat %40 : vector<4xf32>
        %42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %43 = vector.fma %41, %39, %42 : vector<4xf32>
        %44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %46 = vector.extract %44[0, 0] : vector<1x1xf32>
        %47 = splat %46 : vector<4xf32>
        %48 = vector.fma %47, %45, %43 : vector<4xf32>
        %49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %51 = vector.extract %49[0, 0] : vector<1x1xf32>
        %52 = splat %51 : vector<4xf32>
        %53 = vector.fma %52, %50, %48 : vector<4xf32>
        %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
        %55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %58 = vector.extract %56[0, 0] : vector<1x1xf32>
        %59 = splat %58 : vector<4xf32>
        %60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %61 = vector.fma %59, %57, %60 : vector<4xf32>
        %62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %64 = vector.extract %62[0, 0] : vector<1x1xf32>
        %65 = splat %64 : vector<4xf32>
        %66 = vector.fma %65, %63, %61 : vector<4xf32>
        %67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %69 = vector.extract %67[0, 0] : vector<1x1xf32>
        %70 = splat %69 : vector<4xf32>
        %71 = vector.fma %70, %68, %66 : vector<4xf32>
        %72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
        %73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %76 = vector.extract %74[0, 0] : vector<1x1xf32>
        %77 = splat %76 : vector<4xf32>
        %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %79 = vector.fma %77, %75, %78 : vector<4xf32>
        %80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %82 = vector.extract %80[0, 0] : vector<1x1xf32>
        %83 = splat %82 : vector<4xf32>
        %84 = vector.fma %83, %81, %79 : vector<4xf32>
        %85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %87 = vector.extract %85[0, 0] : vector<1x1xf32>
        %88 = splat %87 : vector<4xf32>
        %89 = vector.fma %88, %86, %84 : vector<4xf32>
        %90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
        %91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
        %94 = vector.extract %92[0, 0] : vector<1x1xf32>
        %95 = splat %94 : vector<4xf32>
        %96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
        %97 = vector.fma %95, %93, %96 : vector<4xf32>
        %98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
        %100 = vector.extract %98[0, 0] : vector<1x1xf32>
        %101 = splat %100 : vector<4xf32>
        %102 = vector.fma %101, %99, %97 : vector<4xf32>
        %103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %105 = vector.extract %103[0, 0] : vector<1x1xf32>
        %106 = splat %105 : vector<4xf32>
        %107 = vector.fma %106, %104, %102 : vector<4xf32>
        %108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
        scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  }
  return
}

// -----// IR Dump After ConvertAffineToStandard //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %cst_0 = constant 0.000000e+00 : f32
    %c6 = constant 6 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %c8 = constant 8 : index
    %3 = muli %workgroup_id_y, %c8 : index
    %c32 = constant 32 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %c16 = constant 16 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %c4_1 = constant 4 : index
    %10 = muli %8, %c4_1 : index
    %c4_2 = constant 4 : index
    %11 = muli %7, %c4_2 : index
    %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %14 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %c4_3 = constant 4 : index
    %15 = muli %13, %c4_3 : index
    %c4_4 = constant 4 : index
    %16 = muli %12, %c4_4 : index
    %c2_5 = constant 2 : index
    %17 = muli %14, %c2_5 : index
    %c8_6 = constant 8 : index
    %18 = muli %13, %c8_6 : index
    %19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %c2_7 = constant 2 : index
      %20 = muli %arg0, %c2_7 : index
      %21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %40 = vector.extract %38[0, 0] : vector<1x1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %43 = vector.fma %41, %39, %42 : vector<4xf32>
          %44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %46 = vector.extract %44[0, 0] : vector<1x1xf32>
          %47 = splat %46 : vector<4xf32>
          %48 = vector.fma %47, %45, %43 : vector<4xf32>
          %49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %51 = vector.extract %49[0, 0] : vector<1x1xf32>
          %52 = splat %51 : vector<4xf32>
          %53 = vector.fma %52, %50, %48 : vector<4xf32>
          %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
          %55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %58 = vector.extract %56[0, 0] : vector<1x1xf32>
          %59 = splat %58 : vector<4xf32>
          %60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %61 = vector.fma %59, %57, %60 : vector<4xf32>
          %62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %64 = vector.extract %62[0, 0] : vector<1x1xf32>
          %65 = splat %64 : vector<4xf32>
          %66 = vector.fma %65, %63, %61 : vector<4xf32>
          %67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %69 = vector.extract %67[0, 0] : vector<1x1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %68, %66 : vector<4xf32>
          %72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
          %73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %76 = vector.extract %74[0, 0] : vector<1x1xf32>
          %77 = splat %76 : vector<4xf32>
          %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %79 = vector.fma %77, %75, %78 : vector<4xf32>
          %80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %82 = vector.extract %80[0, 0] : vector<1x1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.fma %83, %81, %79 : vector<4xf32>
          %85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %87 = vector.extract %85[0, 0] : vector<1x1xf32>
          %88 = splat %87 : vector<4xf32>
          %89 = vector.fma %88, %86, %84 : vector<4xf32>
          %90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
          %91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %94 = vector.extract %92[0, 0] : vector<1x1xf32>
          %95 = splat %94 : vector<4xf32>
          %96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %97 = vector.fma %95, %93, %96 : vector<4xf32>
          %98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %100 = vector.extract %98[0, 0] : vector<1x1xf32>
          %101 = splat %100 : vector<4xf32>
          %102 = vector.fma %101, %99, %97 : vector<4xf32>
          %103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %105 = vector.extract %103[0, 0] : vector<1x1xf32>
          %106 = splat %105 : vector<4xf32>
          %107 = vector.fma %106, %104, %102 : vector<4xf32>
          %108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
          scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  func @conv_dispatch_0() {
    %c8 = constant 8 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c16 = constant 16 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c112 = constant 112 : index
    %c3 = constant 3 : index
    %c1 = constant 1 : index
    %c6 = constant 6 : index
    %cst = constant 0.000000e+00 : f32
    %cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %10 = muli %8, %c4 : index
    %11 = muli %7, %c4 : index
    %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %14 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %15 = muli %13, %c4 : index
    %16 = muli %12, %c4 : index
    %17 = muli %14, %c2 : index
    %18 = muli %13, %c8 : index
    %19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %20 = muli %arg0, %c2 : index
      %21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %40 = vector.extract %38[0, 0] : vector<1x1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %43 = vector.fma %41, %39, %42 : vector<4xf32>
          %44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %46 = vector.extract %44[0, 0] : vector<1x1xf32>
          %47 = splat %46 : vector<4xf32>
          %48 = vector.fma %47, %45, %43 : vector<4xf32>
          %49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %51 = vector.extract %49[0, 0] : vector<1x1xf32>
          %52 = splat %51 : vector<4xf32>
          %53 = vector.fma %52, %50, %48 : vector<4xf32>
          %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
          %55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %58 = vector.extract %56[0, 0] : vector<1x1xf32>
          %59 = splat %58 : vector<4xf32>
          %60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %61 = vector.fma %59, %57, %60 : vector<4xf32>
          %62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %64 = vector.extract %62[0, 0] : vector<1x1xf32>
          %65 = splat %64 : vector<4xf32>
          %66 = vector.fma %65, %63, %61 : vector<4xf32>
          %67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %69 = vector.extract %67[0, 0] : vector<1x1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %68, %66 : vector<4xf32>
          %72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
          %73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %76 = vector.extract %74[0, 0] : vector<1x1xf32>
          %77 = splat %76 : vector<4xf32>
          %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %79 = vector.fma %77, %75, %78 : vector<4xf32>
          %80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %82 = vector.extract %80[0, 0] : vector<1x1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.fma %83, %81, %79 : vector<4xf32>
          %85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %87 = vector.extract %85[0, 0] : vector<1x1xf32>
          %88 = splat %87 : vector<4xf32>
          %89 = vector.fma %88, %86, %84 : vector<4xf32>
          %90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
          %91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
          %94 = vector.extract %92[0, 0] : vector<1x1xf32>
          %95 = splat %94 : vector<4xf32>
          %96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %97 = vector.fma %95, %93, %96 : vector<4xf32>
          %98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
          %100 = vector.extract %98[0, 0] : vector<1x1xf32>
          %101 = splat %100 : vector<4xf32>
          %102 = vector.fma %101, %99, %97 : vector<4xf32>
          %103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
          %105 = vector.extract %103[0, 0] : vector<1x1xf32>
          %106 = splat %105 : vector<4xf32>
          %107 = vector.fma %106, %104, %102 : vector<4xf32>
          %108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
          scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After CSE //----- //
module  {
  func @conv_dispatch_0() {
    %c8 = constant 8 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c16 = constant 16 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c112 = constant 112 : index
    %c3 = constant 3 : index
    %c1 = constant 1 : index
    %c6 = constant 6 : index
    %cst = constant 0.000000e+00 : f32
    %cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %10 = muli %8, %c4 : index
    %11 = muli %7, %c4 : index
    %12 = muli %9, %c2 : index
    %13 = muli %8, %c8 : index
    %14 = memref.subview %6[0, 0, 0, %11] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %15 = muli %arg0, %c2 : index
      %16 = memref.subview %0[0, %15, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %17 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %18 = memref.subview %17[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %18[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %18[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %18[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %cst_0, %18[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      %19 = memref.subview %16[0, %12, %13, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %20 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %21 = vector.transfer_read %18[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %22 = vector.transfer_read %18[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %23 = vector.transfer_read %18[%c0, %c0, %c3, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %24:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %20, %arg3 = %21, %arg4 = %22, %arg5 = %23) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %25:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %26 = memref.subview %19[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %27 = memref.subview %14[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          %28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %29 = vector.transfer_read %27[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %30 = vector.transfer_read %27[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %31 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %32 = vector.extract_strided_slice %31 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %33 = vector.shape_cast %28 : vector<1x4xf32> to vector<4xf32>
          %34 = vector.extract %32[0, 0] : vector<1x1xf32>
          %35 = splat %34 : vector<4xf32>
          %36 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %37 = vector.fma %35, %33, %36 : vector<4xf32>
          %38 = vector.extract_strided_slice %31 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %39 = vector.shape_cast %29 : vector<1x4xf32> to vector<4xf32>
          %40 = vector.extract %38[0, 0] : vector<1x1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.fma %41, %39, %37 : vector<4xf32>
          %43 = vector.extract_strided_slice %31 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %44 = vector.shape_cast %30 : vector<1x4xf32> to vector<4xf32>
          %45 = vector.extract %43[0, 0] : vector<1x1xf32>
          %46 = splat %45 : vector<4xf32>
          %47 = vector.fma %46, %44, %42 : vector<4xf32>
          %48 = vector.shape_cast %47 : vector<4xf32> to vector<1x4xf32>
          %49 = vector.transfer_read %26[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %50 = vector.extract_strided_slice %49 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %51 = vector.extract %50[0, 0] : vector<1x1xf32>
          %52 = splat %51 : vector<4xf32>
          %53 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %54 = vector.fma %52, %33, %53 : vector<4xf32>
          %55 = vector.extract_strided_slice %49 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %56 = vector.extract %55[0, 0] : vector<1x1xf32>
          %57 = splat %56 : vector<4xf32>
          %58 = vector.fma %57, %39, %54 : vector<4xf32>
          %59 = vector.extract_strided_slice %49 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %60 = vector.extract %59[0, 0] : vector<1x1xf32>
          %61 = splat %60 : vector<4xf32>
          %62 = vector.fma %61, %44, %58 : vector<4xf32>
          %63 = vector.shape_cast %62 : vector<4xf32> to vector<1x4xf32>
          %64 = vector.transfer_read %26[%c0, %c0, %c4, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %66 = vector.extract %65[0, 0] : vector<1x1xf32>
          %67 = splat %66 : vector<4xf32>
          %68 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %69 = vector.fma %67, %33, %68 : vector<4xf32>
          %70 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %71 = vector.extract %70[0, 0] : vector<1x1xf32>
          %72 = splat %71 : vector<4xf32>
          %73 = vector.fma %72, %39, %69 : vector<4xf32>
          %74 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %75 = vector.extract %74[0, 0] : vector<1x1xf32>
          %76 = splat %75 : vector<4xf32>
          %77 = vector.fma %76, %44, %73 : vector<4xf32>
          %78 = vector.shape_cast %77 : vector<4xf32> to vector<1x4xf32>
          %79 = vector.transfer_read %26[%c0, %c0, %c6, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %80 = vector.extract_strided_slice %79 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %81 = vector.extract %80[0, 0] : vector<1x1xf32>
          %82 = splat %81 : vector<4xf32>
          %83 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %84 = vector.fma %82, %33, %83 : vector<4xf32>
          %85 = vector.extract_strided_slice %79 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %86 = vector.extract %85[0, 0] : vector<1x1xf32>
          %87 = splat %86 : vector<4xf32>
          %88 = vector.fma %87, %39, %84 : vector<4xf32>
          %89 = vector.extract_strided_slice %79 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %90 = vector.extract %89[0, 0] : vector<1x1xf32>
          %91 = splat %90 : vector<4xf32>
          %92 = vector.fma %91, %44, %88 : vector<4xf32>
          %93 = vector.shape_cast %92 : vector<4xf32> to vector<1x4xf32>
          scf.yield %48, %63, %78, %93 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %25#0, %25#1, %25#2, %25#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      vector.transfer_write %24#3, %18[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %24#2, %18[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %24#1, %18[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %24#0, %18[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After OptimizeVectorTransfer //----- //
func @conv_dispatch_0() {
  %c8 = constant 8 : index
  %c2 = constant 2 : index
  %c4 = constant 4 : index
  %c16 = constant 16 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c112 = constant 112 : index
  %c3 = constant 3 : index
  %c1 = constant 1 : index
  %c6 = constant 6 : index
  %cst = constant 0.000000e+00 : f32
  %cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = muli %workgroup_id_y, %c8 : index
  %4 = muli %workgroup_id_x, %c32 : index
  %5 = muli %workgroup_id_y, %c16 : index
  %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %10 = muli %8, %c4 : index
  %11 = muli %7, %c4 : index
  %12 = muli %9, %c2 : index
  %13 = muli %8, %c8 : index
  %14 = memref.subview %6[0, 0, 0, %11] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
  %16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
  %17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
  %18 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %19 = muli %arg0, %c2 : index
    %20 = memref.subview %0[0, %19, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %21 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %22 = memref.subview %21[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %23 = memref.subview %20[0, %12, %13, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %24 = vector.shape_cast %15 : vector<4xf32> to vector<1x4xf32>
    %25 = vector.shape_cast %16 : vector<4xf32> to vector<1x4xf32>
    %26 = vector.shape_cast %17 : vector<4xf32> to vector<1x4xf32>
    %27 = vector.shape_cast %18 : vector<4xf32> to vector<1x4xf32>
    %28:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %24, %arg3 = %25, %arg4 = %26, %arg5 = %27) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %33:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %34 = memref.subview %23[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %35 = memref.subview %14[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
        %37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
        %38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
        %39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
        %40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %41 = vector.extract %40[0] : vector<1xf32>
        %42 = splat %41 : vector<4xf32>
        %43 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %44 = vector.fma %42, %36, %43 : vector<4xf32>
        %45 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %46 = vector.extract %45[0] : vector<1xf32>
        %47 = splat %46 : vector<4xf32>
        %48 = vector.fma %47, %37, %44 : vector<4xf32>
        %49 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %50 = vector.extract %49[0] : vector<1xf32>
        %51 = splat %50 : vector<4xf32>
        %52 = vector.fma %51, %38, %48 : vector<4xf32>
        %53 = vector.shape_cast %52 : vector<4xf32> to vector<1x4xf32>
        %54 = vector.transfer_read %34[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
        %55 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %56 = vector.extract %55[0] : vector<1xf32>
        %57 = splat %56 : vector<4xf32>
        %58 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %59 = vector.fma %57, %36, %58 : vector<4xf32>
        %60 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %61 = vector.extract %60[0] : vector<1xf32>
        %62 = splat %61 : vector<4xf32>
        %63 = vector.fma %62, %37, %59 : vector<4xf32>
        %64 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %65 = vector.extract %64[0] : vector<1xf32>
        %66 = splat %65 : vector<4xf32>
        %67 = vector.fma %66, %38, %63 : vector<4xf32>
        %68 = vector.shape_cast %67 : vector<4xf32> to vector<1x4xf32>
        %69 = vector.transfer_read %34[%c0, %c0, %c4, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
        %70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %71 = vector.extract %70[0] : vector<1xf32>
        %72 = splat %71 : vector<4xf32>
        %73 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %74 = vector.fma %72, %36, %73 : vector<4xf32>
        %75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %76 = vector.extract %75[0] : vector<1xf32>
        %77 = splat %76 : vector<4xf32>
        %78 = vector.fma %77, %37, %74 : vector<4xf32>
        %79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %80 = vector.extract %79[0] : vector<1xf32>
        %81 = splat %80 : vector<4xf32>
        %82 = vector.fma %81, %38, %78 : vector<4xf32>
        %83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
        %84 = vector.transfer_read %34[%c0, %c0, %c6, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
        %85 = vector.extract_strided_slice %84 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %86 = vector.extract %85[0] : vector<1xf32>
        %87 = splat %86 : vector<4xf32>
        %88 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
        %89 = vector.fma %87, %36, %88 : vector<4xf32>
        %90 = vector.extract_strided_slice %84 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %91 = vector.extract %90[0] : vector<1xf32>
        %92 = splat %91 : vector<4xf32>
        %93 = vector.fma %92, %37, %89 : vector<4xf32>
        %94 = vector.extract_strided_slice %84 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %95 = vector.extract %94[0] : vector<1xf32>
        %96 = splat %95 : vector<4xf32>
        %97 = vector.fma %96, %38, %93 : vector<4xf32>
        %98 = vector.shape_cast %97 : vector<4xf32> to vector<1x4xf32>
        scf.yield %53, %68, %83, %98 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    %29 = vector.shape_cast %28#3 : vector<1x4xf32> to vector<4xf32>
    vector.transfer_write %29, %22[%c0, %c0, %c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %30 = vector.shape_cast %28#2 : vector<1x4xf32> to vector<4xf32>
    vector.transfer_write %30, %22[%c0, %c0, %c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %31 = vector.shape_cast %28#1 : vector<1x4xf32> to vector<4xf32>
    vector.transfer_write %31, %22[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %32 = vector.shape_cast %28#0 : vector<1x4xf32> to vector<4xf32>
    vector.transfer_write %32, %22[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  }
  return
}

// -----// IR Dump After SPIRVLowerExecutableTarget //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
  hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c1 = constant 1 : index
    %c14 = constant 14 : index
    %c112 = constant 112 : index
    hal.return %c1, %c14, %c112 : index, index, index
  }
  builtin.module  {
    func @conv_dispatch_0() {
      %c8 = constant 8 : index
      %c2 = constant 2 : index
      %c4 = constant 4 : index
      %c16 = constant 16 : index
      %c32 = constant 32 : index
      %c0 = constant 0 : index
      %c112 = constant 112 : index
      %c3 = constant 3 : index
      %c1 = constant 1 : index
      %c6 = constant 6 : index
      %cst = constant 0.000000e+00 : f32
      %cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
      %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
      %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
      %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_id_z = hal.interface.workgroup.id[2] : index
      %3 = muli %workgroup_id_y, %c8 : index
      %4 = muli %workgroup_id_x, %c32 : index
      %5 = muli %workgroup_id_y, %c16 : index
      %6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      %7 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %8 = "gpu.thread_id"() {dimension = "y"} : () -> index
      %9 = "gpu.thread_id"() {dimension = "z"} : () -> index
      %10 = muli %8, %c4 : index
      %11 = muli %7, %c4 : index
      %12 = muli %9, %c2 : index
      %13 = muli %8, %c8 : index
      %14 = memref.subview %6[0, 0, 0, %11] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      %15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
      %16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
      %17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
      %18 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
      scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
        %19 = muli %arg0, %c2 : index
        %20 = memref.subview %0[0, %19, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %21 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        %22 = memref.subview %21[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        %23 = memref.subview %20[0, %12, %13, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %24 = vector.shape_cast %15 : vector<4xf32> to vector<1x4xf32>
        %25 = vector.shape_cast %16 : vector<4xf32> to vector<1x4xf32>
        %26 = vector.shape_cast %17 : vector<4xf32> to vector<1x4xf32>
        %27 = vector.shape_cast %18 : vector<4xf32> to vector<1x4xf32>
        %28:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %24, %arg3 = %25, %arg4 = %26, %arg5 = %27) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %33:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
            %34 = memref.subview %23[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
            %35 = memref.subview %14[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
            %36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
            %37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
            %38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
            %39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
            %40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %41 = vector.extract %40[0] : vector<1xf32>
            %42 = splat %41 : vector<4xf32>
            %43 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
            %44 = vector.fma %42, %36, %43 : vector<4xf32>
            %45 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %46 = vector.extract %45[0] : vector<1xf32>
            %47 = splat %46 : vector<4xf32>
            %48 = vector.fma %47, %37, %44 : vector<4xf32>
            %49 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %50 = vector.extract %49[0] : vector<1xf32>
            %51 = splat %50 : vector<4xf32>
            %52 = vector.fma %51, %38, %48 : vector<4xf32>
            %53 = vector.shape_cast %52 : vector<4xf32> to vector<1x4xf32>
            %54 = vector.transfer_read %34[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
            %55 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %56 = vector.extract %55[0] : vector<1xf32>
            %57 = splat %56 : vector<4xf32>
            %58 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
            %59 = vector.fma %57, %36, %58 : vector<4xf32>
            %60 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %61 = vector.extract %60[0] : vector<1xf32>
            %62 = splat %61 : vector<4xf32>
            %63 = vector.fma %62, %37, %59 : vector<4xf32>
            %64 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %65 = vector.extract %64[0] : vector<1xf32>
            %66 = splat %65 : vector<4xf32>
            %67 = vector.fma %66, %38, %63 : vector<4xf32>
            %68 = vector.shape_cast %67 : vector<4xf32> to vector<1x4xf32>
            %69 = vector.transfer_read %34[%c0, %c0, %c4, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
            %70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %71 = vector.extract %70[0] : vector<1xf32>
            %72 = splat %71 : vector<4xf32>
            %73 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
            %74 = vector.fma %72, %36, %73 : vector<4xf32>
            %75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %76 = vector.extract %75[0] : vector<1xf32>
            %77 = splat %76 : vector<4xf32>
            %78 = vector.fma %77, %37, %74 : vector<4xf32>
            %79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %80 = vector.extract %79[0] : vector<1xf32>
            %81 = splat %80 : vector<4xf32>
            %82 = vector.fma %81, %38, %78 : vector<4xf32>
            %83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
            %84 = vector.transfer_read %34[%c0, %c0, %c6, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
            %85 = vector.extract_strided_slice %84 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %86 = vector.extract %85[0] : vector<1xf32>
            %87 = splat %86 : vector<4xf32>
            %88 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
            %89 = vector.fma %87, %36, %88 : vector<4xf32>
            %90 = vector.extract_strided_slice %84 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %91 = vector.extract %90[0] : vector<1xf32>
            %92 = splat %91 : vector<4xf32>
            %93 = vector.fma %92, %37, %89 : vector<4xf32>
            %94 = vector.extract_strided_slice %84 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
            %95 = vector.extract %94[0] : vector<1xf32>
            %96 = splat %95 : vector<4xf32>
            %97 = vector.fma %96, %38, %93 : vector<4xf32>
            %98 = vector.shape_cast %97 : vector<4xf32> to vector<1x4xf32>
            scf.yield %53, %68, %83, %98 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
          }
          scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        %29 = vector.shape_cast %28#3 : vector<1x4xf32> to vector<4xf32>
        vector.transfer_write %29, %22[%c0, %c0, %c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        %30 = vector.shape_cast %28#2 : vector<1x4xf32> to vector<4xf32>
        vector.transfer_write %30, %22[%c0, %c0, %c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        %31 = vector.shape_cast %28#1 : vector<1x4xf32> to vector<4xf32>
        vector.transfer_write %31, %22[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        %32 = vector.shape_cast %28#0 : vector<1x4xf32> to vector<4xf32>
        vector.transfer_write %32, %22[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      }
      return
    }
    hal.interface private @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// -----// IR Dump After FoldSubViewOps //----- //
module  {
  func @conv_dispatch_0() {
    %c8 = constant 8 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c16 = constant 16 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c112 = constant 112 : index
    %c3 = constant 3 : index
    %c1 = constant 1 : index
    %c6 = constant 6 : index
    %cst = constant 0.000000e+00 : f32
    %cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
      %15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
      %16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
      %17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
      %18:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %14, %arg3 = %15, %arg4 = %16, %arg5 = %17) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %47:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %48 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %49 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
          %50 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
          %51 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%50)[%4]
          %52 = vector.transfer_read %1[%48, %49, %c0, %51], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %53 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %54 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
          %55 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
          %56 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%55)[%4]
          %57 = vector.transfer_read %1[%53, %54, %c1, %56], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %58 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %59 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
          %60 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
          %61 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%60)[%4]
          %62 = vector.transfer_read %1[%58, %59, %c2, %61], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %63 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %64 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
          %65 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%63)[%11]
          %66 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%64)[%12]
          %67 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%65)[%13]
          %68 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%66)[%5]
          %69 = vector.transfer_read %0[%c0, %67, %68, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %71 = vector.extract %70[0] : vector<1xf32>
          %72 = splat %71 : vector<4xf32>
          %73 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %74 = vector.fma %72, %52, %73 : vector<4xf32>
          %75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %76 = vector.extract %75[0] : vector<1xf32>
          %77 = splat %76 : vector<4xf32>
          %78 = vector.fma %77, %57, %74 : vector<4xf32>
          %79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %80 = vector.extract %79[0] : vector<1xf32>
          %81 = splat %80 : vector<4xf32>
          %82 = vector.fma %81, %62, %78 : vector<4xf32>
          %83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
          %84 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %85 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%arg6]
          %86 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%84)[%11]
          %87 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%85)[%12]
          %88 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%86)[%13]
          %89 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%87)[%5]
          %90 = vector.transfer_read %0[%c0, %88, %89, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %91 = vector.extract_strided_slice %90 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %92 = vector.extract %91[0] : vector<1xf32>
          %93 = splat %92 : vector<4xf32>
          %94 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %95 = vector.fma %93, %52, %94 : vector<4xf32>
          %96 = vector.extract_strided_slice %90 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %97 = vector.extract %96[0] : vector<1xf32>
          %98 = splat %97 : vector<4xf32>
          %99 = vector.fma %98, %57, %95 : vector<4xf32>
          %100 = vector.extract_strided_slice %90 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %101 = vector.extract %100[0] : vector<1xf32>
          %102 = splat %101 : vector<4xf32>
          %103 = vector.fma %102, %62, %99 : vector<4xf32>
          %104 = vector.shape_cast %103 : vector<4xf32> to vector<1x4xf32>
          %105 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %106 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c4)[%arg6]
          %107 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%105)[%11]
          %108 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%106)[%12]
          %109 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%107)[%13]
          %110 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%108)[%5]
          %111 = vector.transfer_read %0[%c0, %109, %110, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %112 = vector.extract_strided_slice %111 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %113 = vector.extract %112[0] : vector<1xf32>
          %114 = splat %113 : vector<4xf32>
          %115 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %116 = vector.fma %114, %52, %115 : vector<4xf32>
          %117 = vector.extract_strided_slice %111 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %118 = vector.extract %117[0] : vector<1xf32>
          %119 = splat %118 : vector<4xf32>
          %120 = vector.fma %119, %57, %116 : vector<4xf32>
          %121 = vector.extract_strided_slice %111 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %122 = vector.extract %121[0] : vector<1xf32>
          %123 = splat %122 : vector<4xf32>
          %124 = vector.fma %123, %62, %120 : vector<4xf32>
          %125 = vector.shape_cast %124 : vector<4xf32> to vector<1x4xf32>
          %126 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
          %127 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c6)[%arg6]
          %128 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%126)[%11]
          %129 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%127)[%12]
          %130 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%128)[%13]
          %131 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%129)[%5]
          %132 = vector.transfer_read %0[%c0, %130, %131, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %133 = vector.extract_strided_slice %132 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %134 = vector.extract %133[0] : vector<1xf32>
          %135 = splat %134 : vector<4xf32>
          %136 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %137 = vector.fma %135, %52, %136 : vector<4xf32>
          %138 = vector.extract_strided_slice %132 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %139 = vector.extract %138[0] : vector<1xf32>
          %140 = splat %139 : vector<4xf32>
          %141 = vector.fma %140, %57, %137 : vector<4xf32>
          %142 = vector.extract_strided_slice %132 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %143 = vector.extract %142[0] : vector<1xf32>
          %144 = splat %143 : vector<4xf32>
          %145 = vector.fma %144, %62, %141 : vector<4xf32>
          %146 = vector.shape_cast %145 : vector<4xf32> to vector<1x4xf32>
          scf.yield %83, %104, %125, %146 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %47#0, %47#1, %47#2, %47#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      %19 = vector.shape_cast %18#3 : vector<1x4xf32> to vector<4xf32>
      %20 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
      %21 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c3)[%9]
      %22 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
      %23 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%20)[%arg0]
      %24 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%21)[%3]
      %25 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%22)[%4]
      vector.transfer_write %19, %2[%c0, %23, %24, %25] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %26 = vector.shape_cast %18#2 : vector<1x4xf32> to vector<4xf32>
      %27 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
      %28 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%9]
      %29 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
      %30 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%27)[%arg0]
      %31 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%28)[%3]
      %32 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%29)[%4]
      vector.transfer_write %26, %2[%c0, %30, %31, %32] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %33 = vector.shape_cast %18#1 : vector<1x4xf32> to vector<4xf32>
      %34 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
      %35 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c1)[%9]
      %36 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
      %37 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%34)[%arg0]
      %38 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%35)[%3]
      %39 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%36)[%4]
      vector.transfer_write %33, %2[%c0, %37, %38, %39] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %40 = vector.shape_cast %18#0 : vector<1x4xf32> to vector<4xf32>
      %41 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
      %42 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%9]
      %43 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
      %44 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%41)[%arg0]
      %45 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%42)[%3]
      %46 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%43)[%4]
      vector.transfer_write %40, %2[%c0, %44, %45, %46] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::FoldDimOverShapeCarryingOpPass //----- //
func @conv_dispatch_0() {
  %c8 = constant 8 : index
  %c2 = constant 2 : index
  %c4 = constant 4 : index
  %c16 = constant 16 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c112 = constant 112 : index
  %c3 = constant 3 : index
  %c1 = constant 1 : index
  %c6 = constant 6 : index
  %cst = constant 0.000000e+00 : f32
  %cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = muli %workgroup_id_y, %c8 : index
  %4 = muli %workgroup_id_x, %c32 : index
  %5 = muli %workgroup_id_y, %c16 : index
  %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %9 = muli %7, %c4 : index
  %10 = muli %6, %c4 : index
  %11 = muli %8, %c2 : index
  %12 = muli %7, %c8 : index
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %13 = muli %arg0, %c2 : index
    %14 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
    %15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
    %16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
    %17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
    %18:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %14, %arg3 = %15, %arg4 = %16, %arg5 = %17) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %47:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %48 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %49 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
        %50 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
        %51 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%50)[%4]
        %52 = vector.transfer_read %1[%48, %49, %c0, %51], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
        %53 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %54 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
        %55 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
        %56 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%55)[%4]
        %57 = vector.transfer_read %1[%53, %54, %c1, %56], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
        %58 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %59 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
        %60 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
        %61 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%60)[%4]
        %62 = vector.transfer_read %1[%58, %59, %c2, %61], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
        %63 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %64 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
        %65 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%63)[%11]
        %66 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%64)[%12]
        %67 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%65)[%13]
        %68 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%66)[%5]
        %69 = vector.transfer_read %0[%c0, %67, %68, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
        %70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %71 = vector.extract %70[0] : vector<1xf32>
        %72 = splat %71 : vector<4xf32>
        %73 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %74 = vector.fma %72, %52, %73 : vector<4xf32>
        %75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %76 = vector.extract %75[0] : vector<1xf32>
        %77 = splat %76 : vector<4xf32>
        %78 = vector.fma %77, %57, %74 : vector<4xf32>
        %79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %80 = vector.extract %79[0] : vector<1xf32>
        %81 = splat %80 : vector<4xf32>
        %82 = vector.fma %81, %62, %78 : vector<4xf32>
        %83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
        %84 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %85 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%arg6]
        %86 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%84)[%11]
        %87 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%85)[%12]
        %88 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%86)[%13]
        %89 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%87)[%5]
        %90 = vector.transfer_read %0[%c0, %88, %89, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
        %91 = vector.extract_strided_slice %90 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %92 = vector.extract %91[0] : vector<1xf32>
        %93 = splat %92 : vector<4xf32>
        %94 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %95 = vector.fma %93, %52, %94 : vector<4xf32>
        %96 = vector.extract_strided_slice %90 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %97 = vector.extract %96[0] : vector<1xf32>
        %98 = splat %97 : vector<4xf32>
        %99 = vector.fma %98, %57, %95 : vector<4xf32>
        %100 = vector.extract_strided_slice %90 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %101 = vector.extract %100[0] : vector<1xf32>
        %102 = splat %101 : vector<4xf32>
        %103 = vector.fma %102, %62, %99 : vector<4xf32>
        %104 = vector.shape_cast %103 : vector<4xf32> to vector<1x4xf32>
        %105 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %106 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c4)[%arg6]
        %107 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%105)[%11]
        %108 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%106)[%12]
        %109 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%107)[%13]
        %110 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%108)[%5]
        %111 = vector.transfer_read %0[%c0, %109, %110, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
        %112 = vector.extract_strided_slice %111 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %113 = vector.extract %112[0] : vector<1xf32>
        %114 = splat %113 : vector<4xf32>
        %115 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %116 = vector.fma %114, %52, %115 : vector<4xf32>
        %117 = vector.extract_strided_slice %111 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %118 = vector.extract %117[0] : vector<1xf32>
        %119 = splat %118 : vector<4xf32>
        %120 = vector.fma %119, %57, %116 : vector<4xf32>
        %121 = vector.extract_strided_slice %111 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %122 = vector.extract %121[0] : vector<1xf32>
        %123 = splat %122 : vector<4xf32>
        %124 = vector.fma %123, %62, %120 : vector<4xf32>
        %125 = vector.shape_cast %124 : vector<4xf32> to vector<1x4xf32>
        %126 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
        %127 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c6)[%arg6]
        %128 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%126)[%11]
        %129 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%127)[%12]
        %130 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%128)[%13]
        %131 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%129)[%5]
        %132 = vector.transfer_read %0[%c0, %130, %131, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
        %133 = vector.extract_strided_slice %132 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %134 = vector.extract %133[0] : vector<1xf32>
        %135 = splat %134 : vector<4xf32>
        %136 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
        %137 = vector.fma %135, %52, %136 : vector<4xf32>
        %138 = vector.extract_strided_slice %132 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %139 = vector.extract %138[0] : vector<1xf32>
        %140 = splat %139 : vector<4xf32>
        %141 = vector.fma %140, %57, %137 : vector<4xf32>
        %142 = vector.extract_strided_slice %132 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %143 = vector.extract %142[0] : vector<1xf32>
        %144 = splat %143 : vector<4xf32>
        %145 = vector.fma %144, %62, %141 : vector<4xf32>
        %146 = vector.shape_cast %145 : vector<4xf32> to vector<1x4xf32>
        scf.yield %83, %104, %125, %146 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %47#0, %47#1, %47#2, %47#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    %19 = vector.shape_cast %18#3 : vector<1x4xf32> to vector<4xf32>
    %20 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
    %21 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c3)[%9]
    %22 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
    %23 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%20)[%arg0]
    %24 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%21)[%3]
    %25 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%22)[%4]
    vector.transfer_write %19, %2[%c0, %23, %24, %25] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
    %26 = vector.shape_cast %18#2 : vector<1x4xf32> to vector<4xf32>
    %27 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
    %28 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%9]
    %29 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
    %30 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%27)[%arg0]
    %31 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%28)[%3]
    %32 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%29)[%4]
    vector.transfer_write %26, %2[%c0, %30, %31, %32] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
    %33 = vector.shape_cast %18#1 : vector<1x4xf32> to vector<4xf32>
    %34 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
    %35 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c1)[%9]
    %36 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
    %37 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%34)[%arg0]
    %38 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%35)[%3]
    %39 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%36)[%4]
    vector.transfer_write %33, %2[%c0, %37, %38, %39] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
    %40 = vector.shape_cast %18#0 : vector<1x4xf32> to vector<4xf32>
    %41 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
    %42 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%9]
    %43 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
    %44 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%41)[%arg0]
    %45 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%42)[%3]
    %46 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%43)[%4]
    vector.transfer_write %40, %2[%c0, %44, %45, %46] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
  }
  return
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<1x4xf32>
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c16 = constant 16 : index
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c8 = constant 8 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %33 = vector.transfer_read %1[%arg1, %arg6, %c0, %32], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %34 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %35 = vector.transfer_read %1[%arg1, %arg6, %c1, %34], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %36 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %37 = vector.transfer_read %1[%arg1, %arg6, %c2, %36], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %39 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
          %40 = vector.transfer_read %0[%c0, %38, %39, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %41 = vector.extract_strided_slice %40 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %42 = vector.extract %41[0] : vector<1xf32>
          %43 = splat %42 : vector<4xf32>
          %44 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %45 = vector.fma %43, %33, %44 : vector<4xf32>
          %46 = vector.extract_strided_slice %40 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %47 = vector.extract %46[0] : vector<1xf32>
          %48 = splat %47 : vector<4xf32>
          %49 = vector.fma %48, %35, %45 : vector<4xf32>
          %50 = vector.extract_strided_slice %40 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %51 = vector.extract %50[0] : vector<1xf32>
          %52 = splat %51 : vector<4xf32>
          %53 = vector.fma %52, %37, %49 : vector<4xf32>
          %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
          %55 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %56 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
          %57 = vector.transfer_read %0[%c0, %55, %56, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %58 = vector.extract_strided_slice %57 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %59 = vector.extract %58[0] : vector<1xf32>
          %60 = splat %59 : vector<4xf32>
          %61 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %62 = vector.fma %60, %33, %61 : vector<4xf32>
          %63 = vector.extract_strided_slice %57 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %64 = vector.extract %63[0] : vector<1xf32>
          %65 = splat %64 : vector<4xf32>
          %66 = vector.fma %65, %35, %62 : vector<4xf32>
          %67 = vector.extract_strided_slice %57 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %68 = vector.extract %67[0] : vector<1xf32>
          %69 = splat %68 : vector<4xf32>
          %70 = vector.fma %69, %37, %66 : vector<4xf32>
          %71 = vector.shape_cast %70 : vector<4xf32> to vector<1x4xf32>
          %72 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %73 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
          %74 = vector.transfer_read %0[%c0, %72, %73, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %75 = vector.extract_strided_slice %74 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %76 = vector.extract %75[0] : vector<1xf32>
          %77 = splat %76 : vector<4xf32>
          %78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %79 = vector.fma %77, %33, %78 : vector<4xf32>
          %80 = vector.extract_strided_slice %74 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %81 = vector.extract %80[0] : vector<1xf32>
          %82 = splat %81 : vector<4xf32>
          %83 = vector.fma %82, %35, %79 : vector<4xf32>
          %84 = vector.extract_strided_slice %74 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %85 = vector.extract %84[0] : vector<1xf32>
          %86 = splat %85 : vector<4xf32>
          %87 = vector.fma %86, %37, %83 : vector<4xf32>
          %88 = vector.shape_cast %87 : vector<4xf32> to vector<1x4xf32>
          %89 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %90 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
          %91 = vector.transfer_read %0[%c0, %89, %90, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %92 = vector.extract_strided_slice %91 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %93 = vector.extract %92[0] : vector<1xf32>
          %94 = splat %93 : vector<4xf32>
          %95 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %96 = vector.fma %94, %33, %95 : vector<4xf32>
          %97 = vector.extract_strided_slice %91 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %98 = vector.extract %97[0] : vector<1xf32>
          %99 = splat %98 : vector<4xf32>
          %100 = vector.fma %99, %35, %96 : vector<4xf32>
          %101 = vector.extract_strided_slice %91 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %102 = vector.extract %101[0] : vector<1xf32>
          %103 = splat %102 : vector<4xf32>
          %104 = vector.fma %103, %37, %100 : vector<4xf32>
          %105 = vector.shape_cast %104 : vector<4xf32> to vector<1x4xf32>
          scf.yield %54, %71, %88, %105 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      %15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
      %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
      %18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      vector.transfer_write %15, %2[%c0, %16, %17, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %19 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
      %20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
      %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      vector.transfer_write %19, %2[%c0, %20, %21, %22] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %23 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
      %24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
      %26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      vector.transfer_write %23, %2[%c0, %24, %25, %26] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %27 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
      %28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %29 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
      %30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      vector.transfer_write %27, %2[%c0, %28, %29, %30] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After CSE //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<1x4xf32>
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c16 = constant 16 : index
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c8 = constant 8 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %25:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %27 = vector.transfer_read %1[%arg1, %arg6, %c0, %26], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %28 = vector.transfer_read %1[%arg1, %arg6, %c1, %26], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %29 = vector.transfer_read %1[%arg1, %arg6, %c2, %26], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
          %30 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %31 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
          %32 = vector.transfer_read %0[%c0, %30, %31, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %33 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %34 = vector.extract %33[0] : vector<1xf32>
          %35 = splat %34 : vector<4xf32>
          %36 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %37 = vector.fma %35, %27, %36 : vector<4xf32>
          %38 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %39 = vector.extract %38[0] : vector<1xf32>
          %40 = splat %39 : vector<4xf32>
          %41 = vector.fma %40, %28, %37 : vector<4xf32>
          %42 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %43 = vector.extract %42[0] : vector<1xf32>
          %44 = splat %43 : vector<4xf32>
          %45 = vector.fma %44, %29, %41 : vector<4xf32>
          %46 = vector.shape_cast %45 : vector<4xf32> to vector<1x4xf32>
          %47 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
          %48 = vector.transfer_read %0[%c0, %30, %47, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %49 = vector.extract_strided_slice %48 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %50 = vector.extract %49[0] : vector<1xf32>
          %51 = splat %50 : vector<4xf32>
          %52 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %53 = vector.fma %51, %27, %52 : vector<4xf32>
          %54 = vector.extract_strided_slice %48 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %55 = vector.extract %54[0] : vector<1xf32>
          %56 = splat %55 : vector<4xf32>
          %57 = vector.fma %56, %28, %53 : vector<4xf32>
          %58 = vector.extract_strided_slice %48 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %59 = vector.extract %58[0] : vector<1xf32>
          %60 = splat %59 : vector<4xf32>
          %61 = vector.fma %60, %29, %57 : vector<4xf32>
          %62 = vector.shape_cast %61 : vector<4xf32> to vector<1x4xf32>
          %63 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
          %64 = vector.transfer_read %0[%c0, %30, %63, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %65 = vector.extract_strided_slice %64 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %66 = vector.extract %65[0] : vector<1xf32>
          %67 = splat %66 : vector<4xf32>
          %68 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %69 = vector.fma %67, %27, %68 : vector<4xf32>
          %70 = vector.extract_strided_slice %64 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %71 = vector.extract %70[0] : vector<1xf32>
          %72 = splat %71 : vector<4xf32>
          %73 = vector.fma %72, %28, %69 : vector<4xf32>
          %74 = vector.extract_strided_slice %64 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %75 = vector.extract %74[0] : vector<1xf32>
          %76 = splat %75 : vector<4xf32>
          %77 = vector.fma %76, %29, %73 : vector<4xf32>
          %78 = vector.shape_cast %77 : vector<4xf32> to vector<1x4xf32>
          %79 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
          %80 = vector.transfer_read %0[%c0, %30, %79, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
          %81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %82 = vector.extract %81[0] : vector<1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %85 = vector.fma %83, %27, %84 : vector<4xf32>
          %86 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %87 = vector.extract %86[0] : vector<1xf32>
          %88 = splat %87 : vector<4xf32>
          %89 = vector.fma %88, %28, %85 : vector<4xf32>
          %90 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %91 = vector.extract %90[0] : vector<1xf32>
          %92 = splat %91 : vector<4xf32>
          %93 = vector.fma %92, %29, %89 : vector<4xf32>
          %94 = vector.shape_cast %93 : vector<4xf32> to vector<1x4xf32>
          scf.yield %46, %62, %78, %94 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %25#0, %25#1, %25#2, %25#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      %15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
      %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
      %18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      vector.transfer_write %15, %2[%c0, %16, %17, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %19 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
      %20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
      vector.transfer_write %19, %2[%c0, %16, %20, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %21 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
      %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
      vector.transfer_write %21, %2[%c0, %16, %22, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
      %23 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
      %24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
      vector.transfer_write %23, %2[%c0, %16, %24, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After SPIRVVectorizeLoadStore //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<1x4xf32>
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c16 = constant 16 : index
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c8 = constant 8 : index
    %cst_0 = constant dense<0.000000e+00> : vector<3xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %29:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          %30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %31 = divi_signed %30, %c4 : index
          %32 = memref.load %1[%arg1, %arg6, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
          %33 = divi_signed %30, %c4 : index
          %34 = memref.load %1[%arg1, %arg6, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
          %35 = divi_signed %30, %c4 : index
          %36 = memref.load %1[%arg1, %arg6, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
          %37 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
          %39 = memref.load %0[%c0, %37, %38, %c0] : memref<1x225x225x3xf32>
          %40 = vector.insert %39, %cst_0 [0] : f32 into vector<3xf32>
          %41 = memref.load %0[%c0, %37, %38, %c1] : memref<1x225x225x3xf32>
          %42 = vector.insert %41, %40 [1] : f32 into vector<3xf32>
          %43 = memref.load %0[%c0, %37, %38, %c2] : memref<1x225x225x3xf32>
          %44 = vector.insert %43, %42 [2] : f32 into vector<3xf32>
          %45 = vector.extract_strided_slice %44 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %46 = vector.extract %45[0] : vector<1xf32>
          %47 = splat %46 : vector<4xf32>
          %48 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
          %49 = vector.fma %47, %32, %48 : vector<4xf32>
          %50 = vector.extract_strided_slice %44 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %51 = vector.extract %50[0] : vector<1xf32>
          %52 = splat %51 : vector<4xf32>
          %53 = vector.fma %52, %34, %49 : vector<4xf32>
          %54 = vector.extract_strided_slice %44 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %55 = vector.extract %54[0] : vector<1xf32>
          %56 = splat %55 : vector<4xf32>
          %57 = vector.fma %56, %36, %53 : vector<4xf32>
          %58 = vector.shape_cast %57 : vector<4xf32> to vector<1x4xf32>
          %59 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
          %60 = memref.load %0[%c0, %37, %59, %c0] : memref<1x225x225x3xf32>
          %61 = vector.insert %60, %cst_0 [0] : f32 into vector<3xf32>
          %62 = memref.load %0[%c0, %37, %59, %c1] : memref<1x225x225x3xf32>
          %63 = vector.insert %62, %61 [1] : f32 into vector<3xf32>
          %64 = memref.load %0[%c0, %37, %59, %c2] : memref<1x225x225x3xf32>
          %65 = vector.insert %64, %63 [2] : f32 into vector<3xf32>
          %66 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %67 = vector.extract %66[0] : vector<1xf32>
          %68 = splat %67 : vector<4xf32>
          %69 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
          %70 = vector.fma %68, %32, %69 : vector<4xf32>
          %71 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %72 = vector.extract %71[0] : vector<1xf32>
          %73 = splat %72 : vector<4xf32>
          %74 = vector.fma %73, %34, %70 : vector<4xf32>
          %75 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %76 = vector.extract %75[0] : vector<1xf32>
          %77 = splat %76 : vector<4xf32>
          %78 = vector.fma %77, %36, %74 : vector<4xf32>
          %79 = vector.shape_cast %78 : vector<4xf32> to vector<1x4xf32>
          %80 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
          %81 = memref.load %0[%c0, %37, %80, %c0] : memref<1x225x225x3xf32>
          %82 = vector.insert %81, %cst_0 [0] : f32 into vector<3xf32>
          %83 = memref.load %0[%c0, %37, %80, %c1] : memref<1x225x225x3xf32>
          %84 = vector.insert %83, %82 [1] : f32 into vector<3xf32>
          %85 = memref.load %0[%c0, %37, %80, %c2] : memref<1x225x225x3xf32>
          %86 = vector.insert %85, %84 [2] : f32 into vector<3xf32>
          %87 = vector.extract_strided_slice %86 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %88 = vector.extract %87[0] : vector<1xf32>
          %89 = splat %88 : vector<4xf32>
          %90 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
          %91 = vector.fma %89, %32, %90 : vector<4xf32>
          %92 = vector.extract_strided_slice %86 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %93 = vector.extract %92[0] : vector<1xf32>
          %94 = splat %93 : vector<4xf32>
          %95 = vector.fma %94, %34, %91 : vector<4xf32>
          %96 = vector.extract_strided_slice %86 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %97 = vector.extract %96[0] : vector<1xf32>
          %98 = splat %97 : vector<4xf32>
          %99 = vector.fma %98, %36, %95 : vector<4xf32>
          %100 = vector.shape_cast %99 : vector<4xf32> to vector<1x4xf32>
          %101 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
          %102 = memref.load %0[%c0, %37, %101, %c0] : memref<1x225x225x3xf32>
          %103 = vector.insert %102, %cst_0 [0] : f32 into vector<3xf32>
          %104 = memref.load %0[%c0, %37, %101, %c1] : memref<1x225x225x3xf32>
          %105 = vector.insert %104, %103 [1] : f32 into vector<3xf32>
          %106 = memref.load %0[%c0, %37, %101, %c2] : memref<1x225x225x3xf32>
          %107 = vector.insert %106, %105 [2] : f32 into vector<3xf32>
          %108 = vector.extract_strided_slice %107 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %109 = vector.extract %108[0] : vector<1xf32>
          %110 = splat %109 : vector<4xf32>
          %111 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
          %112 = vector.fma %110, %32, %111 : vector<4xf32>
          %113 = vector.extract_strided_slice %107 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %114 = vector.extract %113[0] : vector<1xf32>
          %115 = splat %114 : vector<4xf32>
          %116 = vector.fma %115, %34, %112 : vector<4xf32>
          %117 = vector.extract_strided_slice %107 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %118 = vector.extract %117[0] : vector<1xf32>
          %119 = splat %118 : vector<4xf32>
          %120 = vector.fma %119, %36, %116 : vector<4xf32>
          %121 = vector.shape_cast %120 : vector<4xf32> to vector<1x4xf32>
          scf.yield %58, %79, %100, %121 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %29#0, %29#1, %29#2, %29#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      %15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
      %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
      %18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      %19 = divi_signed %18, %c4 : index
      memref.store %15, %2[%c0, %16, %17, %19] : memref<1x112x112x8xvector<4xf32>>
      %20 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
      %21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
      %22 = divi_signed %18, %c4 : index
      memref.store %20, %2[%c0, %16, %21, %22] : memref<1x112x112x8xvector<4xf32>>
      %23 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
      %24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
      %25 = divi_signed %18, %c4 : index
      memref.store %23, %2[%c0, %16, %24, %25] : memref<1x112x112x8xvector<4xf32>>
      %26 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
      %27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
      %28 = divi_signed %18, %c4 : index
      memref.store %26, %2[%c0, %16, %27, %28] : memref<1x112x112x8xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After SPIRVVectorToCooperativeMatrix //----- //
func @conv_dispatch_0() {
  %cst = constant dense<0.000000e+00> : vector<1x4xf32>
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c112 = constant 112 : index
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c16 = constant 16 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c8 = constant 8 : index
  %cst_0 = constant dense<0.000000e+00> : vector<3xf32>
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = muli %workgroup_id_y, %c8 : index
  %4 = muli %workgroup_id_x, %c32 : index
  %5 = muli %workgroup_id_y, %c16 : index
  %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %9 = muli %7, %c4 : index
  %10 = muli %6, %c4 : index
  %11 = muli %8, %c2 : index
  %12 = muli %7, %c8 : index
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %13 = muli %arg0, %c2 : index
    %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %29:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
        %31 = divi_signed %30, %c4 : index
        %32 = memref.load %1[%arg1, %arg6, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
        %33 = divi_signed %30, %c4 : index
        %34 = memref.load %1[%arg1, %arg6, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
        %35 = divi_signed %30, %c4 : index
        %36 = memref.load %1[%arg1, %arg6, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
        %37 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
        %38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
        %39 = memref.load %0[%c0, %37, %38, %c0] : memref<1x225x225x3xf32>
        %40 = vector.insert %39, %cst_0 [0] : f32 into vector<3xf32>
        %41 = memref.load %0[%c0, %37, %38, %c1] : memref<1x225x225x3xf32>
        %42 = vector.insert %41, %40 [1] : f32 into vector<3xf32>
        %43 = memref.load %0[%c0, %37, %38, %c2] : memref<1x225x225x3xf32>
        %44 = vector.insert %43, %42 [2] : f32 into vector<3xf32>
        %45 = vector.extract_strided_slice %44 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %46 = vector.extract %45[0] : vector<1xf32>
        %47 = splat %46 : vector<4xf32>
        %48 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %49 = vector.fma %47, %32, %48 : vector<4xf32>
        %50 = vector.extract_strided_slice %44 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %51 = vector.extract %50[0] : vector<1xf32>
        %52 = splat %51 : vector<4xf32>
        %53 = vector.fma %52, %34, %49 : vector<4xf32>
        %54 = vector.extract_strided_slice %44 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %55 = vector.extract %54[0] : vector<1xf32>
        %56 = splat %55 : vector<4xf32>
        %57 = vector.fma %56, %36, %53 : vector<4xf32>
        %58 = vector.shape_cast %57 : vector<4xf32> to vector<1x4xf32>
        %59 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
        %60 = memref.load %0[%c0, %37, %59, %c0] : memref<1x225x225x3xf32>
        %61 = vector.insert %60, %cst_0 [0] : f32 into vector<3xf32>
        %62 = memref.load %0[%c0, %37, %59, %c1] : memref<1x225x225x3xf32>
        %63 = vector.insert %62, %61 [1] : f32 into vector<3xf32>
        %64 = memref.load %0[%c0, %37, %59, %c2] : memref<1x225x225x3xf32>
        %65 = vector.insert %64, %63 [2] : f32 into vector<3xf32>
        %66 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %67 = vector.extract %66[0] : vector<1xf32>
        %68 = splat %67 : vector<4xf32>
        %69 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %70 = vector.fma %68, %32, %69 : vector<4xf32>
        %71 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %72 = vector.extract %71[0] : vector<1xf32>
        %73 = splat %72 : vector<4xf32>
        %74 = vector.fma %73, %34, %70 : vector<4xf32>
        %75 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %76 = vector.extract %75[0] : vector<1xf32>
        %77 = splat %76 : vector<4xf32>
        %78 = vector.fma %77, %36, %74 : vector<4xf32>
        %79 = vector.shape_cast %78 : vector<4xf32> to vector<1x4xf32>
        %80 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
        %81 = memref.load %0[%c0, %37, %80, %c0] : memref<1x225x225x3xf32>
        %82 = vector.insert %81, %cst_0 [0] : f32 into vector<3xf32>
        %83 = memref.load %0[%c0, %37, %80, %c1] : memref<1x225x225x3xf32>
        %84 = vector.insert %83, %82 [1] : f32 into vector<3xf32>
        %85 = memref.load %0[%c0, %37, %80, %c2] : memref<1x225x225x3xf32>
        %86 = vector.insert %85, %84 [2] : f32 into vector<3xf32>
        %87 = vector.extract_strided_slice %86 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %88 = vector.extract %87[0] : vector<1xf32>
        %89 = splat %88 : vector<4xf32>
        %90 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %91 = vector.fma %89, %32, %90 : vector<4xf32>
        %92 = vector.extract_strided_slice %86 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %93 = vector.extract %92[0] : vector<1xf32>
        %94 = splat %93 : vector<4xf32>
        %95 = vector.fma %94, %34, %91 : vector<4xf32>
        %96 = vector.extract_strided_slice %86 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %97 = vector.extract %96[0] : vector<1xf32>
        %98 = splat %97 : vector<4xf32>
        %99 = vector.fma %98, %36, %95 : vector<4xf32>
        %100 = vector.shape_cast %99 : vector<4xf32> to vector<1x4xf32>
        %101 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
        %102 = memref.load %0[%c0, %37, %101, %c0] : memref<1x225x225x3xf32>
        %103 = vector.insert %102, %cst_0 [0] : f32 into vector<3xf32>
        %104 = memref.load %0[%c0, %37, %101, %c1] : memref<1x225x225x3xf32>
        %105 = vector.insert %104, %103 [1] : f32 into vector<3xf32>
        %106 = memref.load %0[%c0, %37, %101, %c2] : memref<1x225x225x3xf32>
        %107 = vector.insert %106, %105 [2] : f32 into vector<3xf32>
        %108 = vector.extract_strided_slice %107 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %109 = vector.extract %108[0] : vector<1xf32>
        %110 = splat %109 : vector<4xf32>
        %111 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
        %112 = vector.fma %110, %32, %111 : vector<4xf32>
        %113 = vector.extract_strided_slice %107 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %114 = vector.extract %113[0] : vector<1xf32>
        %115 = splat %114 : vector<4xf32>
        %116 = vector.fma %115, %34, %112 : vector<4xf32>
        %117 = vector.extract_strided_slice %107 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %118 = vector.extract %117[0] : vector<1xf32>
        %119 = splat %118 : vector<4xf32>
        %120 = vector.fma %119, %36, %116 : vector<4xf32>
        %121 = vector.shape_cast %120 : vector<4xf32> to vector<1x4xf32>
        scf.yield %58, %79, %100, %121 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %29#0, %29#1, %29#2, %29#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    %15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
    %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
    %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
    %18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
    %19 = divi_signed %18, %c4 : index
    memref.store %15, %2[%c0, %16, %17, %19] : memref<1x112x112x8xvector<4xf32>>
    %20 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
    %21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
    %22 = divi_signed %18, %c4 : index
    memref.store %20, %2[%c0, %16, %21, %22] : memref<1x112x112x8xvector<4xf32>>
    %23 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
    %24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
    %25 = divi_signed %18, %c4 : index
    memref.store %23, %2[%c0, %16, %24, %25] : memref<1x112x112x8xvector<4xf32>>
    %26 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
    %27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
    %28 = divi_signed %18, %c4 : index
    memref.store %26, %2[%c0, %16, %27, %28] : memref<1x112x112x8xvector<4xf32>>
  }
  return
}

// -----// IR Dump After ForOpCanonicalization //----- //
func @conv_dispatch_0() {
  %cst = constant dense<0.000000e+00> : vector<1x4xf32>
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c112 = constant 112 : index
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c16 = constant 16 : index
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c8 = constant 8 : index
  %cst_0 = constant dense<0.000000e+00> : vector<3xf32>
  %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
  %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %3 = muli %workgroup_id_y, %c8 : index
  %4 = muli %workgroup_id_x, %c32 : index
  %5 = muli %workgroup_id_y, %c16 : index
  %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %9 = muli %7, %c4 : index
  %10 = muli %6, %c4 : index
  %11 = muli %8, %c2 : index
  %12 = muli %7, %c8 : index
  scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
    %13 = muli %arg0, %c2 : index
    %14 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
    %15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
    %16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
    %17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
    %18:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %14, %arg3 = %15, %arg4 = %16, %arg5 = %17) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
      %29:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
        %31 = divi_signed %30, %c4 : index
        %32 = memref.load %1[%arg1, %arg6, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
        %33 = divi_signed %30, %c4 : index
        %34 = memref.load %1[%arg1, %arg6, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
        %35 = divi_signed %30, %c4 : index
        %36 = memref.load %1[%arg1, %arg6, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
        %37 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
        %38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
        %39 = memref.load %0[%c0, %37, %38, %c0] : memref<1x225x225x3xf32>
        %40 = vector.insert %39, %cst_0 [0] : f32 into vector<3xf32>
        %41 = memref.load %0[%c0, %37, %38, %c1] : memref<1x225x225x3xf32>
        %42 = vector.insert %41, %40 [1] : f32 into vector<3xf32>
        %43 = memref.load %0[%c0, %37, %38, %c2] : memref<1x225x225x3xf32>
        %44 = vector.insert %43, %42 [2] : f32 into vector<3xf32>
        %45 = vector.extract_strided_slice %44 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %46 = vector.extract %45[0] : vector<1xf32>
        %47 = splat %46 : vector<4xf32>
        %48 = vector.fma %47, %32, %arg7 : vector<4xf32>
        %49 = vector.extract_strided_slice %44 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %50 = vector.extract %49[0] : vector<1xf32>
        %51 = splat %50 : vector<4xf32>
        %52 = vector.fma %51, %34, %48 : vector<4xf32>
        %53 = vector.extract_strided_slice %44 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %54 = vector.extract %53[0] : vector<1xf32>
        %55 = splat %54 : vector<4xf32>
        %56 = vector.fma %55, %36, %52 : vector<4xf32>
        %57 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
        %58 = memref.load %0[%c0, %37, %57, %c0] : memref<1x225x225x3xf32>
        %59 = vector.insert %58, %cst_0 [0] : f32 into vector<3xf32>
        %60 = memref.load %0[%c0, %37, %57, %c1] : memref<1x225x225x3xf32>
        %61 = vector.insert %60, %59 [1] : f32 into vector<3xf32>
        %62 = memref.load %0[%c0, %37, %57, %c2] : memref<1x225x225x3xf32>
        %63 = vector.insert %62, %61 [2] : f32 into vector<3xf32>
        %64 = vector.extract_strided_slice %63 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %65 = vector.extract %64[0] : vector<1xf32>
        %66 = splat %65 : vector<4xf32>
        %67 = vector.fma %66, %32, %arg8 : vector<4xf32>
        %68 = vector.extract_strided_slice %63 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %69 = vector.extract %68[0] : vector<1xf32>
        %70 = splat %69 : vector<4xf32>
        %71 = vector.fma %70, %34, %67 : vector<4xf32>
        %72 = vector.extract_strided_slice %63 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %73 = vector.extract %72[0] : vector<1xf32>
        %74 = splat %73 : vector<4xf32>
        %75 = vector.fma %74, %36, %71 : vector<4xf32>
        %76 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
        %77 = memref.load %0[%c0, %37, %76, %c0] : memref<1x225x225x3xf32>
        %78 = vector.insert %77, %cst_0 [0] : f32 into vector<3xf32>
        %79 = memref.load %0[%c0, %37, %76, %c1] : memref<1x225x225x3xf32>
        %80 = vector.insert %79, %78 [1] : f32 into vector<3xf32>
        %81 = memref.load %0[%c0, %37, %76, %c2] : memref<1x225x225x3xf32>
        %82 = vector.insert %81, %80 [2] : f32 into vector<3xf32>
        %83 = vector.extract_strided_slice %82 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %84 = vector.extract %83[0] : vector<1xf32>
        %85 = splat %84 : vector<4xf32>
        %86 = vector.fma %85, %32, %arg9 : vector<4xf32>
        %87 = vector.extract_strided_slice %82 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %88 = vector.extract %87[0] : vector<1xf32>
        %89 = splat %88 : vector<4xf32>
        %90 = vector.fma %89, %34, %86 : vector<4xf32>
        %91 = vector.extract_strided_slice %82 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %92 = vector.extract %91[0] : vector<1xf32>
        %93 = splat %92 : vector<4xf32>
        %94 = vector.fma %93, %36, %90 : vector<4xf32>
        %95 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
        %96 = memref.load %0[%c0, %37, %95, %c0] : memref<1x225x225x3xf32>
        %97 = vector.insert %96, %cst_0 [0] : f32 into vector<3xf32>
        %98 = memref.load %0[%c0, %37, %95, %c1] : memref<1x225x225x3xf32>
        %99 = vector.insert %98, %97 [1] : f32 into vector<3xf32>
        %100 = memref.load %0[%c0, %37, %95, %c2] : memref<1x225x225x3xf32>
        %101 = vector.insert %100, %99 [2] : f32 into vector<3xf32>
        %102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %103 = vector.extract %102[0] : vector<1xf32>
        %104 = splat %103 : vector<4xf32>
        %105 = vector.fma %104, %32, %arg10 : vector<4xf32>
        %106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %107 = vector.extract %106[0] : vector<1xf32>
        %108 = splat %107 : vector<4xf32>
        %109 = vector.fma %108, %34, %105 : vector<4xf32>
        %110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
        %111 = vector.extract %110[0] : vector<1xf32>
        %112 = splat %111 : vector<4xf32>
        %113 = vector.fma %112, %36, %109 : vector<4xf32>
        scf.yield %56, %75, %94, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      scf.yield %29#0, %29#1, %29#2, %29#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
    }
    %19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
    %20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
    %21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
    %22 = divi_signed %21, %c4 : index
    memref.store %18#3, %2[%c0, %19, %20, %22] : memref<1x112x112x8xvector<4xf32>>
    %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
    %24 = divi_signed %21, %c4 : index
    memref.store %18#2, %2[%c0, %19, %23, %24] : memref<1x112x112x8xvector<4xf32>>
    %25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
    %26 = divi_signed %21, %c4 : index
    memref.store %18#1, %2[%c0, %19, %25, %26] : memref<1x112x112x8xvector<4xf32>>
    %27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
    %28 = divi_signed %21, %c4 : index
    memref.store %18#0, %2[%c0, %19, %27, %28] : memref<1x112x112x8xvector<4xf32>>
  }
  return
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<4xf32>
    %cst_0 = constant dense<0.000000e+00> : vector<3xf32>
    %c8 = constant 8 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c16 = constant 16 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c112 = constant 112 : index
    %c3 = constant 3 : index
    %c1 = constant 1 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %25:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
          %26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %27 = divi_signed %26, %c4 : index
          %28 = memref.load %1[%arg1, %arg6, %c0, %27] : memref<3x3x3x8xvector<4xf32>>
          %29 = divi_signed %26, %c4 : index
          %30 = memref.load %1[%arg1, %arg6, %c1, %29] : memref<3x3x3x8xvector<4xf32>>
          %31 = divi_signed %26, %c4 : index
          %32 = memref.load %1[%arg1, %arg6, %c2, %31] : memref<3x3x3x8xvector<4xf32>>
          %33 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %34 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
          %35 = memref.load %0[%c0, %33, %34, %c0] : memref<1x225x225x3xf32>
          %36 = vector.insert %35, %cst_0 [0] : f32 into vector<3xf32>
          %37 = memref.load %0[%c0, %33, %34, %c1] : memref<1x225x225x3xf32>
          %38 = vector.insert %37, %36 [1] : f32 into vector<3xf32>
          %39 = memref.load %0[%c0, %33, %34, %c2] : memref<1x225x225x3xf32>
          %40 = vector.insert %39, %38 [2] : f32 into vector<3xf32>
          %41 = vector.extract_strided_slice %40 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %42 = vector.extract %41[0] : vector<1xf32>
          %43 = splat %42 : vector<4xf32>
          %44 = vector.fma %43, %28, %arg7 : vector<4xf32>
          %45 = vector.extract_strided_slice %40 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %46 = vector.extract %45[0] : vector<1xf32>
          %47 = splat %46 : vector<4xf32>
          %48 = vector.fma %47, %30, %44 : vector<4xf32>
          %49 = vector.extract_strided_slice %40 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %50 = vector.extract %49[0] : vector<1xf32>
          %51 = splat %50 : vector<4xf32>
          %52 = vector.fma %51, %32, %48 : vector<4xf32>
          %53 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
          %54 = memref.load %0[%c0, %33, %53, %c0] : memref<1x225x225x3xf32>
          %55 = vector.insert %54, %cst_0 [0] : f32 into vector<3xf32>
          %56 = memref.load %0[%c0, %33, %53, %c1] : memref<1x225x225x3xf32>
          %57 = vector.insert %56, %55 [1] : f32 into vector<3xf32>
          %58 = memref.load %0[%c0, %33, %53, %c2] : memref<1x225x225x3xf32>
          %59 = vector.insert %58, %57 [2] : f32 into vector<3xf32>
          %60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %61 = vector.extract %60[0] : vector<1xf32>
          %62 = splat %61 : vector<4xf32>
          %63 = vector.fma %62, %28, %arg8 : vector<4xf32>
          %64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %65 = vector.extract %64[0] : vector<1xf32>
          %66 = splat %65 : vector<4xf32>
          %67 = vector.fma %66, %30, %63 : vector<4xf32>
          %68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %69 = vector.extract %68[0] : vector<1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %32, %67 : vector<4xf32>
          %72 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
          %73 = memref.load %0[%c0, %33, %72, %c0] : memref<1x225x225x3xf32>
          %74 = vector.insert %73, %cst_0 [0] : f32 into vector<3xf32>
          %75 = memref.load %0[%c0, %33, %72, %c1] : memref<1x225x225x3xf32>
          %76 = vector.insert %75, %74 [1] : f32 into vector<3xf32>
          %77 = memref.load %0[%c0, %33, %72, %c2] : memref<1x225x225x3xf32>
          %78 = vector.insert %77, %76 [2] : f32 into vector<3xf32>
          %79 = vector.extract_strided_slice %78 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %80 = vector.extract %79[0] : vector<1xf32>
          %81 = splat %80 : vector<4xf32>
          %82 = vector.fma %81, %28, %arg9 : vector<4xf32>
          %83 = vector.extract_strided_slice %78 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %84 = vector.extract %83[0] : vector<1xf32>
          %85 = splat %84 : vector<4xf32>
          %86 = vector.fma %85, %30, %82 : vector<4xf32>
          %87 = vector.extract_strided_slice %78 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %88 = vector.extract %87[0] : vector<1xf32>
          %89 = splat %88 : vector<4xf32>
          %90 = vector.fma %89, %32, %86 : vector<4xf32>
          %91 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
          %92 = memref.load %0[%c0, %33, %91, %c0] : memref<1x225x225x3xf32>
          %93 = vector.insert %92, %cst_0 [0] : f32 into vector<3xf32>
          %94 = memref.load %0[%c0, %33, %91, %c1] : memref<1x225x225x3xf32>
          %95 = vector.insert %94, %93 [1] : f32 into vector<3xf32>
          %96 = memref.load %0[%c0, %33, %91, %c2] : memref<1x225x225x3xf32>
          %97 = vector.insert %96, %95 [2] : f32 into vector<3xf32>
          %98 = vector.extract_strided_slice %97 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %99 = vector.extract %98[0] : vector<1xf32>
          %100 = splat %99 : vector<4xf32>
          %101 = vector.fma %100, %28, %arg10 : vector<4xf32>
          %102 = vector.extract_strided_slice %97 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %103 = vector.extract %102[0] : vector<1xf32>
          %104 = splat %103 : vector<4xf32>
          %105 = vector.fma %104, %30, %101 : vector<4xf32>
          %106 = vector.extract_strided_slice %97 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %107 = vector.extract %106[0] : vector<1xf32>
          %108 = splat %107 : vector<4xf32>
          %109 = vector.fma %108, %32, %105 : vector<4xf32>
          scf.yield %52, %71, %90, %109 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
        }
        scf.yield %25#0, %25#1, %25#2, %25#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      %15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
      %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      %18 = divi_signed %17, %c4 : index
      memref.store %14#3, %2[%c0, %15, %16, %18] : memref<1x112x112x8xvector<4xf32>>
      %19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
      %20 = divi_signed %17, %c4 : index
      memref.store %14#2, %2[%c0, %15, %19, %20] : memref<1x112x112x8xvector<4xf32>>
      %21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
      %22 = divi_signed %17, %c4 : index
      memref.store %14#1, %2[%c0, %15, %21, %22] : memref<1x112x112x8xvector<4xf32>>
      %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
      %24 = divi_signed %17, %c4 : index
      memref.store %14#0, %2[%c0, %15, %23, %24] : memref<1x112x112x8xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After CSE //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<4xf32>
    %cst_0 = constant dense<0.000000e+00> : vector<3xf32>
    %c8 = constant 8 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c16 = constant 16 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c112 = constant 112 : index
    %c3 = constant 3 : index
    %c1 = constant 1 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %22:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
          %23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %24 = divi_signed %23, %c4 : index
          %25 = memref.load %1[%arg1, %arg6, %c0, %24] : memref<3x3x3x8xvector<4xf32>>
          %26 = memref.load %1[%arg1, %arg6, %c1, %24] : memref<3x3x3x8xvector<4xf32>>
          %27 = memref.load %1[%arg1, %arg6, %c2, %24] : memref<3x3x3x8xvector<4xf32>>
          %28 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
          %29 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
          %30 = memref.load %0[%c0, %28, %29, %c0] : memref<1x225x225x3xf32>
          %31 = vector.insert %30, %cst_0 [0] : f32 into vector<3xf32>
          %32 = memref.load %0[%c0, %28, %29, %c1] : memref<1x225x225x3xf32>
          %33 = vector.insert %32, %31 [1] : f32 into vector<3xf32>
          %34 = memref.load %0[%c0, %28, %29, %c2] : memref<1x225x225x3xf32>
          %35 = vector.insert %34, %33 [2] : f32 into vector<3xf32>
          %36 = vector.extract_strided_slice %35 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %37 = vector.extract %36[0] : vector<1xf32>
          %38 = splat %37 : vector<4xf32>
          %39 = vector.fma %38, %25, %arg7 : vector<4xf32>
          %40 = vector.extract_strided_slice %35 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %41 = vector.extract %40[0] : vector<1xf32>
          %42 = splat %41 : vector<4xf32>
          %43 = vector.fma %42, %26, %39 : vector<4xf32>
          %44 = vector.extract_strided_slice %35 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %45 = vector.extract %44[0] : vector<1xf32>
          %46 = splat %45 : vector<4xf32>
          %47 = vector.fma %46, %27, %43 : vector<4xf32>
          %48 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
          %49 = memref.load %0[%c0, %28, %48, %c0] : memref<1x225x225x3xf32>
          %50 = vector.insert %49, %cst_0 [0] : f32 into vector<3xf32>
          %51 = memref.load %0[%c0, %28, %48, %c1] : memref<1x225x225x3xf32>
          %52 = vector.insert %51, %50 [1] : f32 into vector<3xf32>
          %53 = memref.load %0[%c0, %28, %48, %c2] : memref<1x225x225x3xf32>
          %54 = vector.insert %53, %52 [2] : f32 into vector<3xf32>
          %55 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %56 = vector.extract %55[0] : vector<1xf32>
          %57 = splat %56 : vector<4xf32>
          %58 = vector.fma %57, %25, %arg8 : vector<4xf32>
          %59 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %60 = vector.extract %59[0] : vector<1xf32>
          %61 = splat %60 : vector<4xf32>
          %62 = vector.fma %61, %26, %58 : vector<4xf32>
          %63 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %64 = vector.extract %63[0] : vector<1xf32>
          %65 = splat %64 : vector<4xf32>
          %66 = vector.fma %65, %27, %62 : vector<4xf32>
          %67 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
          %68 = memref.load %0[%c0, %28, %67, %c0] : memref<1x225x225x3xf32>
          %69 = vector.insert %68, %cst_0 [0] : f32 into vector<3xf32>
          %70 = memref.load %0[%c0, %28, %67, %c1] : memref<1x225x225x3xf32>
          %71 = vector.insert %70, %69 [1] : f32 into vector<3xf32>
          %72 = memref.load %0[%c0, %28, %67, %c2] : memref<1x225x225x3xf32>
          %73 = vector.insert %72, %71 [2] : f32 into vector<3xf32>
          %74 = vector.extract_strided_slice %73 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %75 = vector.extract %74[0] : vector<1xf32>
          %76 = splat %75 : vector<4xf32>
          %77 = vector.fma %76, %25, %arg9 : vector<4xf32>
          %78 = vector.extract_strided_slice %73 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %79 = vector.extract %78[0] : vector<1xf32>
          %80 = splat %79 : vector<4xf32>
          %81 = vector.fma %80, %26, %77 : vector<4xf32>
          %82 = vector.extract_strided_slice %73 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %83 = vector.extract %82[0] : vector<1xf32>
          %84 = splat %83 : vector<4xf32>
          %85 = vector.fma %84, %27, %81 : vector<4xf32>
          %86 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
          %87 = memref.load %0[%c0, %28, %86, %c0] : memref<1x225x225x3xf32>
          %88 = vector.insert %87, %cst_0 [0] : f32 into vector<3xf32>
          %89 = memref.load %0[%c0, %28, %86, %c1] : memref<1x225x225x3xf32>
          %90 = vector.insert %89, %88 [1] : f32 into vector<3xf32>
          %91 = memref.load %0[%c0, %28, %86, %c2] : memref<1x225x225x3xf32>
          %92 = vector.insert %91, %90 [2] : f32 into vector<3xf32>
          %93 = vector.extract_strided_slice %92 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %94 = vector.extract %93[0] : vector<1xf32>
          %95 = splat %94 : vector<4xf32>
          %96 = vector.fma %95, %25, %arg10 : vector<4xf32>
          %97 = vector.extract_strided_slice %92 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %98 = vector.extract %97[0] : vector<1xf32>
          %99 = splat %98 : vector<4xf32>
          %100 = vector.fma %99, %26, %96 : vector<4xf32>
          %101 = vector.extract_strided_slice %92 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %102 = vector.extract %101[0] : vector<1xf32>
          %103 = splat %102 : vector<4xf32>
          %104 = vector.fma %103, %27, %100 : vector<4xf32>
          scf.yield %47, %66, %85, %104 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
        }
        scf.yield %22#0, %22#1, %22#2, %22#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      %15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
      %16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
      %17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      %18 = divi_signed %17, %c4 : index
      memref.store %14#3, %2[%c0, %15, %16, %18] : memref<1x112x112x8xvector<4xf32>>
      %19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
      memref.store %14#2, %2[%c0, %15, %19, %18] : memref<1x112x112x8xvector<4xf32>>
      %20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
      memref.store %14#1, %2[%c0, %15, %20, %18] : memref<1x112x112x8xvector<4xf32>>
      %21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
      memref.store %14#0, %2[%c0, %15, %21, %18] : memref<1x112x112x8xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After FlattenMemRefSubspan //----- //
module  {
  func @conv_dispatch_0() {
    %cst = constant dense<0.000000e+00> : vector<4xf32>
    %cst_0 = constant dense<0.000000e+00> : vector<3xf32>
    %c8 = constant 8 : index
    %c2 = constant 2 : index
    %c4 = constant 4 : index
    %c16 = constant 16 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c112 = constant 112 : index
    %c3 = constant 3 : index
    %c1 = constant 1 : index
    %c151875 = constant 151875 : index
    %c216 = constant 216 : index
    %c100352 = constant 100352 : index
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %21:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
          %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %23 = divi_signed %22, %c4 : index
          %24 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2)>(%arg1, %arg6, %23)
          %25 = memref.load %1[%24] : memref<?xvector<4xf32>>
          %26 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 8)>(%arg1, %arg6, %23)
          %27 = memref.load %1[%26] : memref<?xvector<4xf32>>
          %28 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 16)>(%arg1, %arg6, %23)
          %29 = memref.load %1[%28] : memref<?xvector<4xf32>>
          %30 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3)>(%arg1, %arg6)[%13, %11, %5, %12]
          %31 = memref.load %0[%30] : memref<?xf32>
          %32 = vector.insert %31, %cst_0 [0] : f32 into vector<3xf32>
          %33 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 1)>(%arg1, %arg6)[%13, %11, %5, %12]
          %34 = memref.load %0[%33] : memref<?xf32>
          %35 = vector.insert %34, %32 [1] : f32 into vector<3xf32>
          %36 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 2)>(%arg1, %arg6)[%13, %11, %5, %12]
          %37 = memref.load %0[%36] : memref<?xf32>
          %38 = vector.insert %37, %35 [2] : f32 into vector<3xf32>
          %39 = vector.extract_strided_slice %38 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %40 = vector.extract %39[0] : vector<1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.fma %41, %25, %arg7 : vector<4xf32>
          %43 = vector.extract_strided_slice %38 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %44 = vector.extract %43[0] : vector<1xf32>
          %45 = splat %44 : vector<4xf32>
          %46 = vector.fma %45, %27, %42 : vector<4xf32>
          %47 = vector.extract_strided_slice %38 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %48 = vector.extract %47[0] : vector<1xf32>
          %49 = splat %48 : vector<4xf32>
          %50 = vector.fma %49, %29, %46 : vector<4xf32>
          %51 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 6)>(%arg1)[%13, %11, %5, %12, %arg6]
          %52 = memref.load %0[%51] : memref<?xf32>
          %53 = vector.insert %52, %cst_0 [0] : f32 into vector<3xf32>
          %54 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 7)>(%arg1)[%13, %11, %5, %12, %arg6]
          %55 = memref.load %0[%54] : memref<?xf32>
          %56 = vector.insert %55, %53 [1] : f32 into vector<3xf32>
          %57 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 8)>(%arg1)[%13, %11, %5, %12, %arg6]
          %58 = memref.load %0[%57] : memref<?xf32>
          %59 = vector.insert %58, %56 [2] : f32 into vector<3xf32>
          %60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %61 = vector.extract %60[0] : vector<1xf32>
          %62 = splat %61 : vector<4xf32>
          %63 = vector.fma %62, %25, %arg8 : vector<4xf32>
          %64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %65 = vector.extract %64[0] : vector<1xf32>
          %66 = splat %65 : vector<4xf32>
          %67 = vector.fma %66, %27, %63 : vector<4xf32>
          %68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %69 = vector.extract %68[0] : vector<1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %29, %67 : vector<4xf32>
          %72 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 12)>(%arg1)[%13, %11, %5, %12, %arg6]
          %73 = memref.load %0[%72] : memref<?xf32>
          %74 = vector.insert %73, %cst_0 [0] : f32 into vector<3xf32>
          %75 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 13)>(%arg1)[%13, %11, %5, %12, %arg6]
          %76 = memref.load %0[%75] : memref<?xf32>
          %77 = vector.insert %76, %74 [1] : f32 into vector<3xf32>
          %78 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 14)>(%arg1)[%13, %11, %5, %12, %arg6]
          %79 = memref.load %0[%78] : memref<?xf32>
          %80 = vector.insert %79, %77 [2] : f32 into vector<3xf32>
          %81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %82 = vector.extract %81[0] : vector<1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.fma %83, %25, %arg9 : vector<4xf32>
          %85 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %86 = vector.extract %85[0] : vector<1xf32>
          %87 = splat %86 : vector<4xf32>
          %88 = vector.fma %87, %27, %84 : vector<4xf32>
          %89 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %90 = vector.extract %89[0] : vector<1xf32>
          %91 = splat %90 : vector<4xf32>
          %92 = vector.fma %91, %29, %88 : vector<4xf32>
          %93 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 18)>(%arg1)[%13, %11, %5, %12, %arg6]
          %94 = memref.load %0[%93] : memref<?xf32>
          %95 = vector.insert %94, %cst_0 [0] : f32 into vector<3xf32>
          %96 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 19)>(%arg1)[%13, %11, %5, %12, %arg6]
          %97 = memref.load %0[%96] : memref<?xf32>
          %98 = vector.insert %97, %95 [1] : f32 into vector<3xf32>
          %99 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 20)>(%arg1)[%13, %11, %5, %12, %arg6]
          %100 = memref.load %0[%99] : memref<?xf32>
          %101 = vector.insert %100, %98 [2] : f32 into vector<3xf32>
          %102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %103 = vector.extract %102[0] : vector<1xf32>
          %104 = splat %103 : vector<4xf32>
          %105 = vector.fma %104, %25, %arg10 : vector<4xf32>
          %106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %107 = vector.extract %106[0] : vector<1xf32>
          %108 = splat %107 : vector<4xf32>
          %109 = vector.fma %108, %27, %105 : vector<4xf32>
          %110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %111 = vector.extract %110[0] : vector<1xf32>
          %112 = splat %111 : vector<4xf32>
          %113 = vector.fma %112, %29, %109 : vector<4xf32>
          scf.yield %50, %71, %92, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
        }
        scf.yield %21#0, %21#1, %21#2, %21#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      %15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      %16 = divi_signed %15, %c4 : index
      %17 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 24)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#3, %2[%17] : memref<?xvector<4xf32>>
      %18 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 16)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#2, %2[%18] : memref<?xvector<4xf32>>
      %19 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 8)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#1, %2[%19] : memref<?xvector<4xf32>>
      %20 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#0, %2[%20] : memref<?xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After Canonicalizer //----- //
module  {
  func @conv_dispatch_0() {
    %c100352 = constant 100352 : index
    %c216 = constant 216 : index
    %c151875 = constant 151875 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c16 = constant 16 : index
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c8 = constant 8 : index
    %cst = constant dense<0.000000e+00> : vector<3xf32>
    %cst_0 = constant dense<0.000000e+00> : vector<4xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst_0, %arg3 = %cst_0, %arg4 = %cst_0, %arg5 = %cst_0) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %21:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
          %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %23 = divi_signed %22, %c4 : index
          %24 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2)>(%arg1, %arg6, %23)
          %25 = memref.load %1[%24] : memref<?xvector<4xf32>>
          %26 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 8)>(%arg1, %arg6, %23)
          %27 = memref.load %1[%26] : memref<?xvector<4xf32>>
          %28 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 16)>(%arg1, %arg6, %23)
          %29 = memref.load %1[%28] : memref<?xvector<4xf32>>
          %30 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3)>(%arg1, %arg6)[%13, %11, %5, %12]
          %31 = memref.load %0[%30] : memref<?xf32>
          %32 = vector.insert %31, %cst [0] : f32 into vector<3xf32>
          %33 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 1)>(%arg1, %arg6)[%13, %11, %5, %12]
          %34 = memref.load %0[%33] : memref<?xf32>
          %35 = vector.insert %34, %32 [1] : f32 into vector<3xf32>
          %36 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 2)>(%arg1, %arg6)[%13, %11, %5, %12]
          %37 = memref.load %0[%36] : memref<?xf32>
          %38 = vector.insert %37, %35 [2] : f32 into vector<3xf32>
          %39 = vector.extract_strided_slice %38 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %40 = vector.extract %39[0] : vector<1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.fma %41, %25, %arg7 : vector<4xf32>
          %43 = vector.extract_strided_slice %38 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %44 = vector.extract %43[0] : vector<1xf32>
          %45 = splat %44 : vector<4xf32>
          %46 = vector.fma %45, %27, %42 : vector<4xf32>
          %47 = vector.extract_strided_slice %38 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %48 = vector.extract %47[0] : vector<1xf32>
          %49 = splat %48 : vector<4xf32>
          %50 = vector.fma %49, %29, %46 : vector<4xf32>
          %51 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 6)>(%arg1)[%13, %11, %5, %12, %arg6]
          %52 = memref.load %0[%51] : memref<?xf32>
          %53 = vector.insert %52, %cst [0] : f32 into vector<3xf32>
          %54 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 7)>(%arg1)[%13, %11, %5, %12, %arg6]
          %55 = memref.load %0[%54] : memref<?xf32>
          %56 = vector.insert %55, %53 [1] : f32 into vector<3xf32>
          %57 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 8)>(%arg1)[%13, %11, %5, %12, %arg6]
          %58 = memref.load %0[%57] : memref<?xf32>
          %59 = vector.insert %58, %56 [2] : f32 into vector<3xf32>
          %60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %61 = vector.extract %60[0] : vector<1xf32>
          %62 = splat %61 : vector<4xf32>
          %63 = vector.fma %62, %25, %arg8 : vector<4xf32>
          %64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %65 = vector.extract %64[0] : vector<1xf32>
          %66 = splat %65 : vector<4xf32>
          %67 = vector.fma %66, %27, %63 : vector<4xf32>
          %68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %69 = vector.extract %68[0] : vector<1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %29, %67 : vector<4xf32>
          %72 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 12)>(%arg1)[%13, %11, %5, %12, %arg6]
          %73 = memref.load %0[%72] : memref<?xf32>
          %74 = vector.insert %73, %cst [0] : f32 into vector<3xf32>
          %75 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 13)>(%arg1)[%13, %11, %5, %12, %arg6]
          %76 = memref.load %0[%75] : memref<?xf32>
          %77 = vector.insert %76, %74 [1] : f32 into vector<3xf32>
          %78 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 14)>(%arg1)[%13, %11, %5, %12, %arg6]
          %79 = memref.load %0[%78] : memref<?xf32>
          %80 = vector.insert %79, %77 [2] : f32 into vector<3xf32>
          %81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %82 = vector.extract %81[0] : vector<1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.fma %83, %25, %arg9 : vector<4xf32>
          %85 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %86 = vector.extract %85[0] : vector<1xf32>
          %87 = splat %86 : vector<4xf32>
          %88 = vector.fma %87, %27, %84 : vector<4xf32>
          %89 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %90 = vector.extract %89[0] : vector<1xf32>
          %91 = splat %90 : vector<4xf32>
          %92 = vector.fma %91, %29, %88 : vector<4xf32>
          %93 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 18)>(%arg1)[%13, %11, %5, %12, %arg6]
          %94 = memref.load %0[%93] : memref<?xf32>
          %95 = vector.insert %94, %cst [0] : f32 into vector<3xf32>
          %96 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 19)>(%arg1)[%13, %11, %5, %12, %arg6]
          %97 = memref.load %0[%96] : memref<?xf32>
          %98 = vector.insert %97, %95 [1] : f32 into vector<3xf32>
          %99 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 20)>(%arg1)[%13, %11, %5, %12, %arg6]
          %100 = memref.load %0[%99] : memref<?xf32>
          %101 = vector.insert %100, %98 [2] : f32 into vector<3xf32>
          %102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %103 = vector.extract %102[0] : vector<1xf32>
          %104 = splat %103 : vector<4xf32>
          %105 = vector.fma %104, %25, %arg10 : vector<4xf32>
          %106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %107 = vector.extract %106[0] : vector<1xf32>
          %108 = splat %107 : vector<4xf32>
          %109 = vector.fma %108, %27, %105 : vector<4xf32>
          %110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %111 = vector.extract %110[0] : vector<1xf32>
          %112 = splat %111 : vector<4xf32>
          %113 = vector.fma %112, %29, %109 : vector<4xf32>
          scf.yield %50, %71, %92, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
        }
        scf.yield %21#0, %21#1, %21#2, %21#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      %15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      %16 = divi_signed %15, %c4 : index
      %17 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 24)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#3, %2[%17] : memref<?xvector<4xf32>>
      %18 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 16)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#2, %2[%18] : memref<?xvector<4xf32>>
      %19 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 8)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#1, %2[%19] : memref<?xvector<4xf32>>
      %20 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#0, %2[%20] : memref<?xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After CSE //----- //
module  {
  func @conv_dispatch_0() {
    %c100352 = constant 100352 : index
    %c216 = constant 216 : index
    %c151875 = constant 151875 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c16 = constant 16 : index
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c8 = constant 8 : index
    %cst = constant dense<0.000000e+00> : vector<3xf32>
    %cst_0 = constant dense<0.000000e+00> : vector<4xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst_0, %arg3 = %cst_0, %arg4 = %cst_0, %arg5 = %cst_0) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %21:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
          %22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
          %23 = divi_signed %22, %c4 : index
          %24 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2)>(%arg1, %arg6, %23)
          %25 = memref.load %1[%24] : memref<?xvector<4xf32>>
          %26 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 8)>(%arg1, %arg6, %23)
          %27 = memref.load %1[%26] : memref<?xvector<4xf32>>
          %28 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 16)>(%arg1, %arg6, %23)
          %29 = memref.load %1[%28] : memref<?xvector<4xf32>>
          %30 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3)>(%arg1, %arg6)[%13, %11, %5, %12]
          %31 = memref.load %0[%30] : memref<?xf32>
          %32 = vector.insert %31, %cst [0] : f32 into vector<3xf32>
          %33 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 1)>(%arg1, %arg6)[%13, %11, %5, %12]
          %34 = memref.load %0[%33] : memref<?xf32>
          %35 = vector.insert %34, %32 [1] : f32 into vector<3xf32>
          %36 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 2)>(%arg1, %arg6)[%13, %11, %5, %12]
          %37 = memref.load %0[%36] : memref<?xf32>
          %38 = vector.insert %37, %35 [2] : f32 into vector<3xf32>
          %39 = vector.extract_strided_slice %38 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %40 = vector.extract %39[0] : vector<1xf32>
          %41 = splat %40 : vector<4xf32>
          %42 = vector.fma %41, %25, %arg7 : vector<4xf32>
          %43 = vector.extract_strided_slice %38 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %44 = vector.extract %43[0] : vector<1xf32>
          %45 = splat %44 : vector<4xf32>
          %46 = vector.fma %45, %27, %42 : vector<4xf32>
          %47 = vector.extract_strided_slice %38 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %48 = vector.extract %47[0] : vector<1xf32>
          %49 = splat %48 : vector<4xf32>
          %50 = vector.fma %49, %29, %46 : vector<4xf32>
          %51 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 6)>(%arg1)[%13, %11, %5, %12, %arg6]
          %52 = memref.load %0[%51] : memref<?xf32>
          %53 = vector.insert %52, %cst [0] : f32 into vector<3xf32>
          %54 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 7)>(%arg1)[%13, %11, %5, %12, %arg6]
          %55 = memref.load %0[%54] : memref<?xf32>
          %56 = vector.insert %55, %53 [1] : f32 into vector<3xf32>
          %57 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 8)>(%arg1)[%13, %11, %5, %12, %arg6]
          %58 = memref.load %0[%57] : memref<?xf32>
          %59 = vector.insert %58, %56 [2] : f32 into vector<3xf32>
          %60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %61 = vector.extract %60[0] : vector<1xf32>
          %62 = splat %61 : vector<4xf32>
          %63 = vector.fma %62, %25, %arg8 : vector<4xf32>
          %64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %65 = vector.extract %64[0] : vector<1xf32>
          %66 = splat %65 : vector<4xf32>
          %67 = vector.fma %66, %27, %63 : vector<4xf32>
          %68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %69 = vector.extract %68[0] : vector<1xf32>
          %70 = splat %69 : vector<4xf32>
          %71 = vector.fma %70, %29, %67 : vector<4xf32>
          %72 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 12)>(%arg1)[%13, %11, %5, %12, %arg6]
          %73 = memref.load %0[%72] : memref<?xf32>
          %74 = vector.insert %73, %cst [0] : f32 into vector<3xf32>
          %75 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 13)>(%arg1)[%13, %11, %5, %12, %arg6]
          %76 = memref.load %0[%75] : memref<?xf32>
          %77 = vector.insert %76, %74 [1] : f32 into vector<3xf32>
          %78 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 14)>(%arg1)[%13, %11, %5, %12, %arg6]
          %79 = memref.load %0[%78] : memref<?xf32>
          %80 = vector.insert %79, %77 [2] : f32 into vector<3xf32>
          %81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %82 = vector.extract %81[0] : vector<1xf32>
          %83 = splat %82 : vector<4xf32>
          %84 = vector.fma %83, %25, %arg9 : vector<4xf32>
          %85 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %86 = vector.extract %85[0] : vector<1xf32>
          %87 = splat %86 : vector<4xf32>
          %88 = vector.fma %87, %27, %84 : vector<4xf32>
          %89 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %90 = vector.extract %89[0] : vector<1xf32>
          %91 = splat %90 : vector<4xf32>
          %92 = vector.fma %91, %29, %88 : vector<4xf32>
          %93 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 18)>(%arg1)[%13, %11, %5, %12, %arg6]
          %94 = memref.load %0[%93] : memref<?xf32>
          %95 = vector.insert %94, %cst [0] : f32 into vector<3xf32>
          %96 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 19)>(%arg1)[%13, %11, %5, %12, %arg6]
          %97 = memref.load %0[%96] : memref<?xf32>
          %98 = vector.insert %97, %95 [1] : f32 into vector<3xf32>
          %99 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 20)>(%arg1)[%13, %11, %5, %12, %arg6]
          %100 = memref.load %0[%99] : memref<?xf32>
          %101 = vector.insert %100, %98 [2] : f32 into vector<3xf32>
          %102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %103 = vector.extract %102[0] : vector<1xf32>
          %104 = splat %103 : vector<4xf32>
          %105 = vector.fma %104, %25, %arg10 : vector<4xf32>
          %106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %107 = vector.extract %106[0] : vector<1xf32>
          %108 = splat %107 : vector<4xf32>
          %109 = vector.fma %108, %27, %105 : vector<4xf32>
          %110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %111 = vector.extract %110[0] : vector<1xf32>
          %112 = splat %111 : vector<4xf32>
          %113 = vector.fma %112, %29, %109 : vector<4xf32>
          scf.yield %50, %71, %92, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
        }
        scf.yield %21#0, %21#1, %21#2, %21#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      %15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
      %16 = divi_signed %15, %c4 : index
      %17 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 24)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#3, %2[%17] : memref<?xvector<4xf32>>
      %18 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 16)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#2, %2[%18] : memref<?xvector<4xf32>>
      %19 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 8)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#1, %2[%19] : memref<?xvector<4xf32>>
      %20 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8)>(%16)[%arg0, %8, %3, %9]
      memref.store %14#0, %2[%20] : memref<?xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After ConvertAffineToStandard //----- //
module  {
  func @conv_dispatch_0() {
    %c100352 = constant 100352 : index
    %c216 = constant 216 : index
    %c151875 = constant 151875 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c112 = constant 112 : index
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c16 = constant 16 : index
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c8 = constant 8 : index
    %cst = constant dense<0.000000e+00> : vector<3xf32>
    %cst_0 = constant dense<0.000000e+00> : vector<4xf32>
    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_y, %c8 : index
    %4 = muli %workgroup_id_x, %c32 : index
    %5 = muli %workgroup_id_y, %c16 : index
    %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %7 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %8 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %9 = muli %7, %c4 : index
    %10 = muli %6, %c4 : index
    %11 = muli %8, %c2 : index
    %12 = muli %7, %c8 : index
    scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
      %13 = muli %arg0, %c2 : index
      %14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst_0, %arg3 = %cst_0, %arg4 = %cst_0, %arg5 = %cst_0) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        %52:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
          %53 = addi %4, %10 : index
          %54 = divi_signed %53, %c4 : index
          %c72 = constant 72 : index
          %55 = muli %arg1, %c72 : index
          %c24_18 = constant 24 : index
          %56 = muli %arg6, %c24_18 : index
          %57 = addi %55, %56 : index
          %58 = addi %57, %54 : index
          %59 = memref.load %1[%58] : memref<?xvector<4xf32>>
          %c72_19 = constant 72 : index
          %60 = muli %arg1, %c72_19 : index
          %c24_20 = constant 24 : index
          %61 = muli %arg6, %c24_20 : index
          %62 = addi %60, %61 : index
          %63 = addi %62, %54 : index
          %c8_21 = constant 8 : index
          %64 = addi %63, %c8_21 : index
          %65 = memref.load %1[%64] : memref<?xvector<4xf32>>
          %c72_22 = constant 72 : index
          %66 = muli %arg1, %c72_22 : index
          %c24_23 = constant 24 : index
          %67 = muli %arg6, %c24_23 : index
          %68 = addi %66, %67 : index
          %69 = addi %68, %54 : index
          %c16_24 = constant 16 : index
          %70 = addi %69, %c16_24 : index
          %71 = memref.load %1[%70] : memref<?xvector<4xf32>>
          %c675 = constant 675 : index
          %72 = muli %arg1, %c675 : index
          %c3_25 = constant 3 : index
          %73 = muli %arg6, %c3_25 : index
          %74 = addi %72, %73 : index
          %c675_26 = constant 675 : index
          %75 = muli %13, %c675_26 : index
          %76 = addi %74, %75 : index
          %c675_27 = constant 675 : index
          %77 = muli %11, %c675_27 : index
          %78 = addi %76, %77 : index
          %c3_28 = constant 3 : index
          %79 = muli %5, %c3_28 : index
          %80 = addi %78, %79 : index
          %c3_29 = constant 3 : index
          %81 = muli %12, %c3_29 : index
          %82 = addi %80, %81 : index
          %83 = memref.load %0[%82] : memref<?xf32>
          %84 = vector.insert %83, %cst [0] : f32 into vector<3xf32>
          %c675_30 = constant 675 : index
          %85 = muli %arg1, %c675_30 : index
          %c3_31 = constant 3 : index
          %86 = muli %arg6, %c3_31 : index
          %87 = addi %85, %86 : index
          %c675_32 = constant 675 : index
          %88 = muli %13, %c675_32 : index
          %89 = addi %87, %88 : index
          %c675_33 = constant 675 : index
          %90 = muli %11, %c675_33 : index
          %91 = addi %89, %90 : index
          %c3_34 = constant 3 : index
          %92 = muli %5, %c3_34 : index
          %93 = addi %91, %92 : index
          %c3_35 = constant 3 : index
          %94 = muli %12, %c3_35 : index
          %95 = addi %93, %94 : index
          %c1_36 = constant 1 : index
          %96 = addi %95, %c1_36 : index
          %97 = memref.load %0[%96] : memref<?xf32>
          %98 = vector.insert %97, %84 [1] : f32 into vector<3xf32>
          %c675_37 = constant 675 : index
          %99 = muli %arg1, %c675_37 : index
          %c3_38 = constant 3 : index
          %100 = muli %arg6, %c3_38 : index
          %101 = addi %99, %100 : index
          %c675_39 = constant 675 : index
          %102 = muli %13, %c675_39 : index
          %103 = addi %101, %102 : index
          %c675_40 = constant 675 : index
          %104 = muli %11, %c675_40 : index
          %105 = addi %103, %104 : index
          %c3_41 = constant 3 : index
          %106 = muli %5, %c3_41 : index
          %107 = addi %105, %106 : index
          %c3_42 = constant 3 : index
          %108 = muli %12, %c3_42 : index
          %109 = addi %107, %108 : index
          %c2_43 = constant 2 : index
          %110 = addi %109, %c2_43 : index
          %111 = memref.load %0[%110] : memref<?xf32>
          %112 = vector.insert %111, %98 [2] : f32 into vector<3xf32>
          %113 = vector.extract_strided_slice %112 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %114 = vector.extract %113[0] : vector<1xf32>
          %115 = splat %114 : vector<4xf32>
          %116 = vector.fma %115, %59, %arg7 : vector<4xf32>
          %117 = vector.extract_strided_slice %112 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %118 = vector.extract %117[0] : vector<1xf32>
          %119 = splat %118 : vector<4xf32>
          %120 = vector.fma %119, %65, %116 : vector<4xf32>
          %121 = vector.extract_strided_slice %112 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %122 = vector.extract %121[0] : vector<1xf32>
          %123 = splat %122 : vector<4xf32>
          %124 = vector.fma %123, %71, %120 : vector<4xf32>
          %c675_44 = constant 675 : index
          %125 = muli %arg1, %c675_44 : index
          %c675_45 = constant 675 : index
          %126 = muli %13, %c675_45 : index
          %127 = addi %125, %126 : index
          %c675_46 = constant 675 : index
          %128 = muli %11, %c675_46 : index
          %129 = addi %127, %128 : index
          %c3_47 = constant 3 : index
          %130 = muli %5, %c3_47 : index
          %131 = addi %129, %130 : index
          %c3_48 = constant 3 : index
          %132 = muli %12, %c3_48 : index
          %133 = addi %131, %132 : index
          %c3_49 = constant 3 : index
          %134 = muli %arg6, %c3_49 : index
          %135 = addi %133, %134 : index
          %c6 = constant 6 : index
          %136 = addi %135, %c6 : index
          %137 = memref.load %0[%136] : memref<?xf32>
          %138 = vector.insert %137, %cst [0] : f32 into vector<3xf32>
          %c675_50 = constant 675 : index
          %139 = muli %arg1, %c675_50 : index
          %c675_51 = constant 675 : index
          %140 = muli %13, %c675_51 : index
          %141 = addi %139, %140 : index
          %c675_52 = constant 675 : index
          %142 = muli %11, %c675_52 : index
          %143 = addi %141, %142 : index
          %c3_53 = constant 3 : index
          %144 = muli %5, %c3_53 : index
          %145 = addi %143, %144 : index
          %c3_54 = constant 3 : index
          %146 = muli %12, %c3_54 : index
          %147 = addi %145, %146 : index
          %c3_55 = constant 3 : index
          %148 = muli %arg6, %c3_55 : index
          %149 = addi %147, %148 : index
          %c7 = constant 7 : index
          %150 = addi %149, %c7 : index
          %151 = memref.load %0[%150] : memref<?xf32>
          %152 = vector.insert %151, %138 [1] : f32 into vector<3xf32>
          %c675_56 = constant 675 : index
          %153 = muli %arg1, %c675_56 : index
          %c675_57 = constant 675 : index
          %154 = muli %13, %c675_57 : index
          %155 = addi %153, %154 : index
          %c675_58 = constant 675 : index
          %156 = muli %11, %c675_58 : index
          %157 = addi %155, %156 : index
          %c3_59 = constant 3 : index
          %158 = muli %5, %c3_59 : index
          %159 = addi %157, %158 : index
          %c3_60 = constant 3 : index
          %160 = muli %12, %c3_60 : index
          %161 = addi %159, %160 : index
          %c3_61 = constant 3 : index
          %162 = muli %arg6, %c3_61 : index
          %163 = addi %161, %162 : index
          %c8_62 = constant 8 : index
          %164 = addi %163, %c8_62 : index
          %165 = memref.load %0[%164] : memref<?xf32>
          %166 = vector.insert %165, %152 [2] : f32 into vector<3xf32>
          %167 = vector.extract_strided_slice %166 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %168 = vector.extract %167[0] : vector<1xf32>
          %169 = splat %168 : vector<4xf32>
          %170 = vector.fma %169, %59, %arg8 : vector<4xf32>
          %171 = vector.extract_strided_slice %166 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %172 = vector.extract %171[0] : vector<1xf32>
          %173 = splat %172 : vector<4xf32>
          %174 = vector.fma %173, %65, %170 : vector<4xf32>
          %175 = vector.extract_strided_slice %166 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %176 = vector.extract %175[0] : vector<1xf32>
          %177 = splat %176 : vector<4xf32>
          %178 = vector.fma %177, %71, %174 : vector<4xf32>
          %c675_63 = constant 675 : index
          %179 = muli %arg1, %c675_63 : index
          %c675_64 = constant 675 : index
          %180 = muli %13, %c675_64 : index
          %181 = addi %179, %180 : index
          %c675_65 = constant 675 : index
          %182 = muli %11, %c675_65 : index
          %183 = addi %181, %182 : index
          %c3_66 = constant 3 : index
          %184 = muli %5, %c3_66 : index
          %185 = addi %183, %184 : index
          %c3_67 = constant 3 : index
          %186 = muli %12, %c3_67 : index
          %187 = addi %185, %186 : index
          %c3_68 = constant 3 : index
          %188 = muli %arg6, %c3_68 : index
          %189 = addi %187, %188 : index
          %c12 = constant 12 : index
          %190 = addi %189, %c12 : index
          %191 = memref.load %0[%190] : memref<?xf32>
          %192 = vector.insert %191, %cst [0] : f32 into vector<3xf32>
          %c675_69 = constant 675 : index
          %193 = muli %arg1, %c675_69 : index
          %c675_70 = constant 675 : index
          %194 = muli %13, %c675_70 : index
          %195 = addi %193, %194 : index
          %c675_71 = constant 675 : index
          %196 = muli %11, %c675_71 : index
          %197 = addi %195, %196 : index
          %c3_72 = constant 3 : index
          %198 = muli %5, %c3_72 : index
          %199 = addi %197, %198 : index
          %c3_73 = constant 3 : index
          %200 = muli %12, %c3_73 : index
          %201 = addi %199, %200 : index
          %c3_74 = constant 3 : index
          %202 = muli %arg6, %c3_74 : index
          %203 = addi %201, %202 : index
          %c13 = constant 13 : index
          %204 = addi %203, %c13 : index
          %205 = memref.load %0[%204] : memref<?xf32>
          %206 = vector.insert %205, %192 [1] : f32 into vector<3xf32>
          %c675_75 = constant 675 : index
          %207 = muli %arg1, %c675_75 : index
          %c675_76 = constant 675 : index
          %208 = muli %13, %c675_76 : index
          %209 = addi %207, %208 : index
          %c675_77 = constant 675 : index
          %210 = muli %11, %c675_77 : index
          %211 = addi %209, %210 : index
          %c3_78 = constant 3 : index
          %212 = muli %5, %c3_78 : index
          %213 = addi %211, %212 : index
          %c3_79 = constant 3 : index
          %214 = muli %12, %c3_79 : index
          %215 = addi %213, %214 : index
          %c3_80 = constant 3 : index
          %216 = muli %arg6, %c3_80 : index
          %217 = addi %215, %216 : index
          %c14 = constant 14 : index
          %218 = addi %217, %c14 : index
          %219 = memref.load %0[%218] : memref<?xf32>
          %220 = vector.insert %219, %206 [2] : f32 into vector<3xf32>
          %221 = vector.extract_strided_slice %220 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %222 = vector.extract %221[0] : vector<1xf32>
          %223 = splat %222 : vector<4xf32>
          %224 = vector.fma %223, %59, %arg9 : vector<4xf32>
          %225 = vector.extract_strided_slice %220 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %226 = vector.extract %225[0] : vector<1xf32>
          %227 = splat %226 : vector<4xf32>
          %228 = vector.fma %227, %65, %224 : vector<4xf32>
          %229 = vector.extract_strided_slice %220 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %230 = vector.extract %229[0] : vector<1xf32>
          %231 = splat %230 : vector<4xf32>
          %232 = vector.fma %231, %71, %228 : vector<4xf32>
          %c675_81 = constant 675 : index
          %233 = muli %arg1, %c675_81 : index
          %c675_82 = constant 675 : index
          %234 = muli %13, %c675_82 : index
          %235 = addi %233, %234 : index
          %c675_83 = constant 675 : index
          %236 = muli %11, %c675_83 : index
          %237 = addi %235, %236 : index
          %c3_84 = constant 3 : index
          %238 = muli %5, %c3_84 : index
          %239 = addi %237, %238 : index
          %c3_85 = constant 3 : index
          %240 = muli %12, %c3_85 : index
          %241 = addi %239, %240 : index
          %c3_86 = constant 3 : index
          %242 = muli %arg6, %c3_86 : index
          %243 = addi %241, %242 : index
          %c18 = constant 18 : index
          %244 = addi %243, %c18 : index
          %245 = memref.load %0[%244] : memref<?xf32>
          %246 = vector.insert %245, %cst [0] : f32 into vector<3xf32>
          %c675_87 = constant 675 : index
          %247 = muli %arg1, %c675_87 : index
          %c675_88 = constant 675 : index
          %248 = muli %13, %c675_88 : index
          %249 = addi %247, %248 : index
          %c675_89 = constant 675 : index
          %250 = muli %11, %c675_89 : index
          %251 = addi %249, %250 : index
          %c3_90 = constant 3 : index
          %252 = muli %5, %c3_90 : index
          %253 = addi %251, %252 : index
          %c3_91 = constant 3 : index
          %254 = muli %12, %c3_91 : index
          %255 = addi %253, %254 : index
          %c3_92 = constant 3 : index
          %256 = muli %arg6, %c3_92 : index
          %257 = addi %255, %256 : index
          %c19 = constant 19 : index
          %258 = addi %257, %c19 : index
          %259 = memref.load %0[%258] : memref<?xf32>
          %260 = vector.insert %259, %246 [1] : f32 into vector<3xf32>
          %c675_93 = constant 675 : index
          %261 = muli %arg1, %c675_93 : index
          %c675_94 = constant 675 : index
          %262 = muli %13, %c675_94 : index
          %263 = addi %261, %262 : index
          %c675_95 = constant 675 : index
          %264 = muli %11, %c675_95 : index
          %265 = addi %263, %264 : index
          %c3_96 = constant 3 : index
          %266 = muli %5, %c3_96 : index
          %267 = addi %265, %266 : index
          %c3_97 = constant 3 : index
          %268 = muli %12, %c3_97 : index
          %269 = addi %267, %268 : index
          %c3_98 = constant 3 : index
          %270 = muli %arg6, %c3_98 : index
          %271 = addi %269, %270 : index
          %c20 = constant 20 : index
          %272 = addi %271, %c20 : index
          %273 = memref.load %0[%272] : memref<?xf32>
          %274 = vector.insert %273, %260 [2] : f32 into vector<3xf32>
          %275 = vector.extract_strided_slice %274 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %276 = vector.extract %275[0] : vector<1xf32>
          %277 = splat %276 : vector<4xf32>
          %278 = vector.fma %277, %59, %arg10 : vector<4xf32>
          %279 = vector.extract_strided_slice %274 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %280 = vector.extract %279[0] : vector<1xf32>
          %281 = splat %280 : vector<4xf32>
          %282 = vector.fma %281, %65, %278 : vector<4xf32>
          %283 = vector.extract_strided_slice %274 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
          %284 = vector.extract %283[0] : vector<1xf32>
          %285 = splat %284 : vector<4xf32>
          %286 = vector.fma %285, %71, %282 : vector<4xf32>
          scf.yield %124, %178, %232, %286 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
        }
        scf.yield %52#0, %52#1, %52#2, %52#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      %15 = addi %4, %10 : index
      %16 = divi_signed %15, %c4 : index
      %c896 = constant 896 : index
      %17 = muli %arg0, %c896 : index
      %18 = addi %16, %17 : index
      %c896_1 = constant 896 : index
      %19 = muli %8, %c896_1 : index
      %20 = addi %18, %19 : index
      %c8_2 = constant 8 : index
      %21 = muli %3, %c8_2 : index
      %22 = addi %20, %21 : index
      %c8_3 = constant 8 : index
      %23 = muli %9, %c8_3 : index
      %24 = addi %22, %23 : index
      %c24 = constant 24 : index
      %25 = addi %24, %c24 : index
      memref.store %14#3, %2[%25] : memref<?xvector<4xf32>>
      %c896_4 = constant 896 : index
      %26 = muli %arg0, %c896_4 : index
      %27 = addi %16, %26 : index
      %c896_5 = constant 896 : index
      %28 = muli %8, %c896_5 : index
      %29 = addi %27, %28 : index
      %c8_6 = constant 8 : index
      %30 = muli %3, %c8_6 : index
      %31 = addi %29, %30 : index
      %c8_7 = constant 8 : index
      %32 = muli %9, %c8_7 : index
      %33 = addi %31, %32 : index
      %c16_8 = constant 16 : index
      %34 = addi %33, %c16_8 : index
      memref.store %14#2, %2[%34] : memref<?xvector<4xf32>>
      %c896_9 = constant 896 : index
      %35 = muli %arg0, %c896_9 : index
      %36 = addi %16, %35 : index
      %c896_10 = constant 896 : index
      %37 = muli %8, %c896_10 : index
      %38 = addi %36, %37 : index
      %c8_11 = constant 8 : index
      %39 = muli %3, %c8_11 : index
      %40 = addi %38, %39 : index
      %c8_12 = constant 8 : index
      %41 = muli %9, %c8_12 : index
      %42 = addi %40, %41 : index
      %c8_13 = constant 8 : index
      %43 = addi %42, %c8_13 : index
      memref.store %14#1, %2[%43] : memref<?xvector<4xf32>>
      %c896_14 = constant 896 : index
      %44 = muli %arg0, %c896_14 : index
      %45 = addi %16, %44 : index
      %c896_15 = constant 896 : index
      %46 = muli %8, %c896_15 : index
      %47 = addi %45, %46 : index
      %c8_16 = constant 8 : index
      %48 = muli %3, %c8_16 : index
      %49 = addi %47, %48 : index
      %c8_17 = constant 8 : index
      %50 = muli %9, %c8_17 : index
      %51 = addi %49, %50 : index
      memref.store %14#0, %2[%51] : memref<?xvector<4xf32>>
    }
    return
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After ConvertToSPIRV //----- //
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  spv.module Logical GLSL450 {
    spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
    spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    spv.func @conv_dispatch_0() "None" attributes {spv.entry_point_abi = {local_size = dense<[8, 2, 1]> : vector<3xi32>}} {
      %cst100352_i32 = spv.Constant 100352 : i32
      %cst216_i32 = spv.Constant 216 : i32
      %cst151875_i32 = spv.Constant 151875 : i32
      %cst1_i32 = spv.Constant 1 : i32
      %cst3_i32 = spv.Constant 3 : i32
      %cst112_i32 = spv.Constant 112 : i32
      %cst0_i32 = spv.Constant 0 : i32
      %cst32_i32 = spv.Constant 32 : i32
      %cst16_i32 = spv.Constant 16 : i32
      %cst4_i32 = spv.Constant 4 : i32
      %cst2_i32 = spv.Constant 2 : i32
      %cst8_i32 = spv.Constant 8 : i32
      %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
      %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
      %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
      %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
      %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
      %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
      %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
      %__builtin_var_WorkgroupId___addr_0 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
      %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
      %__builtin_var_WorkgroupId___addr_1 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_1 : vector<3xi32>
      %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
      %6 = spv.IMul %3, %cst8_i32 : i32
      %7 = spv.IMul %1, %cst32_i32 : i32
      %8 = spv.IMul %3, %cst16_i32 : i32
      %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
      %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
      %__builtin_var_LocalInvocationId___addr_2 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_2 : vector<3xi32>
      %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
      %__builtin_var_LocalInvocationId___addr_3 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_3 : vector<3xi32>
      %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
      %15 = spv.IMul %12, %cst4_i32 : i32
      %16 = spv.IMul %10, %cst4_i32 : i32
      %17 = spv.IMul %14, %cst2_i32 : i32
      %18 = spv.IMul %12, %cst8_i32 : i32
      spv.mlir.loop {
        spv.Branch ^bb1(%5 : i32)
      ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
        %20 = spv.SLessThan %19, %cst112_i32 : i32
        spv.BranchConditional %20, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %21 = spv.IMul %19, %cst2_i32 : i32
        %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        spv.mlir.loop {
          spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%80: i32, %81: vector<4xf32>, %82: vector<4xf32>, %83: vector<4xf32>, %84: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %85 = spv.SLessThan %80, %cst3_i32 : i32
          spv.BranchConditional %85, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %86 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %87 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %88 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %89 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          spv.mlir.loop {
            spv.Branch ^bb1(%cst0_i32, %81, %82, %83, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
          ^bb1(%95: i32, %96: vector<4xf32>, %97: vector<4xf32>, %98: vector<4xf32>, %99: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
            %100 = spv.SLessThan %95, %cst3_i32 : i32
            spv.BranchConditional %100, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            %101 = spv.IAdd %7, %16 : i32
            %102 = spv.SDiv %101, %cst4_i32 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %103 = spv.IMul %80, %cst72_i32 : i32
            %cst24_i32_33 = spv.Constant 24 : i32
            %104 = spv.IMul %95, %cst24_i32_33 : i32
            %105 = spv.IAdd %103, %104 : i32
            %106 = spv.IAdd %105, %102 : i32
            %cst0_i32_34 = spv.Constant 0 : i32
            %cst0_i32_35 = spv.Constant 0 : i32
            %cst1_i32_36 = spv.Constant 1 : i32
            %107 = spv.IMul %cst1_i32_36, %106 : i32
            %108 = spv.IAdd %cst0_i32_35, %107 : i32
            %109 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_34, %108] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
            %cst72_i32_37 = spv.Constant 72 : i32
            %111 = spv.IMul %80, %cst72_i32_37 : i32
            %cst24_i32_38 = spv.Constant 24 : i32
            %112 = spv.IMul %95, %cst24_i32_38 : i32
            %113 = spv.IAdd %111, %112 : i32
            %114 = spv.IAdd %113, %102 : i32
            %cst8_i32_39 = spv.Constant 8 : i32
            %115 = spv.IAdd %114, %cst8_i32_39 : i32
            %cst0_i32_40 = spv.Constant 0 : i32
            %cst0_i32_41 = spv.Constant 0 : i32
            %cst1_i32_42 = spv.Constant 1 : i32
            %116 = spv.IMul %cst1_i32_42, %115 : i32
            %117 = spv.IAdd %cst0_i32_41, %116 : i32
            %118 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_40, %117] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            %119 = spv.Load "StorageBuffer" %118 : vector<4xf32>
            %cst72_i32_43 = spv.Constant 72 : i32
            %120 = spv.IMul %80, %cst72_i32_43 : i32
            %cst24_i32_44 = spv.Constant 24 : i32
            %121 = spv.IMul %95, %cst24_i32_44 : i32
            %122 = spv.IAdd %120, %121 : i32
            %123 = spv.IAdd %122, %102 : i32
            %cst16_i32_45 = spv.Constant 16 : i32
            %124 = spv.IAdd %123, %cst16_i32_45 : i32
            %cst0_i32_46 = spv.Constant 0 : i32
            %cst0_i32_47 = spv.Constant 0 : i32
            %cst1_i32_48 = spv.Constant 1 : i32
            %125 = spv.IMul %cst1_i32_48, %124 : i32
            %126 = spv.IAdd %cst0_i32_47, %125 : i32
            %127 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_46, %126] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            %128 = spv.Load "StorageBuffer" %127 : vector<4xf32>
            %cst675_i32 = spv.Constant 675 : i32
            %129 = spv.IMul %80, %cst675_i32 : i32
            %cst3_i32_49 = spv.Constant 3 : i32
            %130 = spv.IMul %95, %cst3_i32_49 : i32
            %131 = spv.IAdd %129, %130 : i32
            %cst675_i32_50 = spv.Constant 675 : i32
            %132 = spv.IMul %21, %cst675_i32_50 : i32
            %133 = spv.IAdd %131, %132 : i32
            %cst675_i32_51 = spv.Constant 675 : i32
            %134 = spv.IMul %17, %cst675_i32_51 : i32
            %135 = spv.IAdd %133, %134 : i32
            %cst3_i32_52 = spv.Constant 3 : i32
            %136 = spv.IMul %8, %cst3_i32_52 : i32
            %137 = spv.IAdd %135, %136 : i32
            %cst3_i32_53 = spv.Constant 3 : i32
            %138 = spv.IMul %18, %cst3_i32_53 : i32
            %139 = spv.IAdd %137, %138 : i32
            %cst0_i32_54 = spv.Constant 0 : i32
            %cst0_i32_55 = spv.Constant 0 : i32
            %cst1_i32_56 = spv.Constant 1 : i32
            %140 = spv.IMul %cst1_i32_56, %139 : i32
            %141 = spv.IAdd %cst0_i32_55, %140 : i32
            %142 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_54, %141] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %143 = spv.Load "StorageBuffer" %142 : f32
            %144 = spv.CompositeInsert %143, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
            %cst675_i32_57 = spv.Constant 675 : i32
            %145 = spv.IMul %80, %cst675_i32_57 : i32
            %cst3_i32_58 = spv.Constant 3 : i32
            %146 = spv.IMul %95, %cst3_i32_58 : i32
            %147 = spv.IAdd %145, %146 : i32
            %cst675_i32_59 = spv.Constant 675 : i32
            %148 = spv.IMul %21, %cst675_i32_59 : i32
            %149 = spv.IAdd %147, %148 : i32
            %cst675_i32_60 = spv.Constant 675 : i32
            %150 = spv.IMul %17, %cst675_i32_60 : i32
            %151 = spv.IAdd %149, %150 : i32
            %cst3_i32_61 = spv.Constant 3 : i32
            %152 = spv.IMul %8, %cst3_i32_61 : i32
            %153 = spv.IAdd %151, %152 : i32
            %cst3_i32_62 = spv.Constant 3 : i32
            %154 = spv.IMul %18, %cst3_i32_62 : i32
            %155 = spv.IAdd %153, %154 : i32
            %cst1_i32_63 = spv.Constant 1 : i32
            %156 = spv.IAdd %155, %cst1_i32_63 : i32
            %cst0_i32_64 = spv.Constant 0 : i32
            %cst0_i32_65 = spv.Constant 0 : i32
            %cst1_i32_66 = spv.Constant 1 : i32
            %157 = spv.IMul %cst1_i32_66, %156 : i32
            %158 = spv.IAdd %cst0_i32_65, %157 : i32
            %159 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_64, %158] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %160 = spv.Load "StorageBuffer" %159 : f32
            %161 = spv.CompositeInsert %160, %144[1 : i32] : f32 into vector<3xf32>
            %cst675_i32_67 = spv.Constant 675 : i32
            %162 = spv.IMul %80, %cst675_i32_67 : i32
            %cst3_i32_68 = spv.Constant 3 : i32
            %163 = spv.IMul %95, %cst3_i32_68 : i32
            %164 = spv.IAdd %162, %163 : i32
            %cst675_i32_69 = spv.Constant 675 : i32
            %165 = spv.IMul %21, %cst675_i32_69 : i32
            %166 = spv.IAdd %164, %165 : i32
            %cst675_i32_70 = spv.Constant 675 : i32
            %167 = spv.IMul %17, %cst675_i32_70 : i32
            %168 = spv.IAdd %166, %167 : i32
            %cst3_i32_71 = spv.Constant 3 : i32
            %169 = spv.IMul %8, %cst3_i32_71 : i32
            %170 = spv.IAdd %168, %169 : i32
            %cst3_i32_72 = spv.Constant 3 : i32
            %171 = spv.IMul %18, %cst3_i32_72 : i32
            %172 = spv.IAdd %170, %171 : i32
            %cst2_i32_73 = spv.Constant 2 : i32
            %173 = spv.IAdd %172, %cst2_i32_73 : i32
            %cst0_i32_74 = spv.Constant 0 : i32
            %cst0_i32_75 = spv.Constant 0 : i32
            %cst1_i32_76 = spv.Constant 1 : i32
            %174 = spv.IMul %cst1_i32_76, %173 : i32
            %175 = spv.IAdd %cst0_i32_75, %174 : i32
            %176 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_74, %175] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %177 = spv.Load "StorageBuffer" %176 : f32
            %178 = spv.CompositeInsert %177, %161[2 : i32] : f32 into vector<3xf32>
            %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
            %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
            %181 = spv.GLSL.Fma %180, %110, %96 : vector<4xf32>
            %182 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
            %183 = spv.CompositeConstruct %182, %182, %182, %182 : vector<4xf32>
            %184 = spv.GLSL.Fma %183, %119, %181 : vector<4xf32>
            %185 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
            %186 = spv.CompositeConstruct %185, %185, %185, %185 : vector<4xf32>
            %187 = spv.GLSL.Fma %186, %128, %184 : vector<4xf32>
            %cst675_i32_77 = spv.Constant 675 : i32
            %188 = spv.IMul %80, %cst675_i32_77 : i32
            %cst675_i32_78 = spv.Constant 675 : i32
            %189 = spv.IMul %21, %cst675_i32_78 : i32
            %190 = spv.IAdd %188, %189 : i32
            %cst675_i32_79 = spv.Constant 675 : i32
            %191 = spv.IMul %17, %cst675_i32_79 : i32
            %192 = spv.IAdd %190, %191 : i32
            %cst3_i32_80 = spv.Constant 3 : i32
            %193 = spv.IMul %8, %cst3_i32_80 : i32
            %194 = spv.IAdd %192, %193 : i32
            %cst3_i32_81 = spv.Constant 3 : i32
            %195 = spv.IMul %18, %cst3_i32_81 : i32
            %196 = spv.IAdd %194, %195 : i32
            %cst3_i32_82 = spv.Constant 3 : i32
            %197 = spv.IMul %95, %cst3_i32_82 : i32
            %198 = spv.IAdd %196, %197 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %199 = spv.IAdd %198, %cst6_i32 : i32
            %cst0_i32_83 = spv.Constant 0 : i32
            %cst0_i32_84 = spv.Constant 0 : i32
            %cst1_i32_85 = spv.Constant 1 : i32
            %200 = spv.IMul %cst1_i32_85, %199 : i32
            %201 = spv.IAdd %cst0_i32_84, %200 : i32
            %202 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_83, %201] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %203 = spv.Load "StorageBuffer" %202 : f32
            %204 = spv.CompositeInsert %203, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
            %cst675_i32_86 = spv.Constant 675 : i32
            %205 = spv.IMul %80, %cst675_i32_86 : i32
            %cst675_i32_87 = spv.Constant 675 : i32
            %206 = spv.IMul %21, %cst675_i32_87 : i32
            %207 = spv.IAdd %205, %206 : i32
            %cst675_i32_88 = spv.Constant 675 : i32
            %208 = spv.IMul %17, %cst675_i32_88 : i32
            %209 = spv.IAdd %207, %208 : i32
            %cst3_i32_89 = spv.Constant 3 : i32
            %210 = spv.IMul %8, %cst3_i32_89 : i32
            %211 = spv.IAdd %209, %210 : i32
            %cst3_i32_90 = spv.Constant 3 : i32
            %212 = spv.IMul %18, %cst3_i32_90 : i32
            %213 = spv.IAdd %211, %212 : i32
            %cst3_i32_91 = spv.Constant 3 : i32
            %214 = spv.IMul %95, %cst3_i32_91 : i32
            %215 = spv.IAdd %213, %214 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %216 = spv.IAdd %215, %cst7_i32 : i32
            %cst0_i32_92 = spv.Constant 0 : i32
            %cst0_i32_93 = spv.Constant 0 : i32
            %cst1_i32_94 = spv.Constant 1 : i32
            %217 = spv.IMul %cst1_i32_94, %216 : i32
            %218 = spv.IAdd %cst0_i32_93, %217 : i32
            %219 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_92, %218] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %220 = spv.Load "StorageBuffer" %219 : f32
            %221 = spv.CompositeInsert %220, %204[1 : i32] : f32 into vector<3xf32>
            %cst675_i32_95 = spv.Constant 675 : i32
            %222 = spv.IMul %80, %cst675_i32_95 : i32
            %cst675_i32_96 = spv.Constant 675 : i32
            %223 = spv.IMul %21, %cst675_i32_96 : i32
            %224 = spv.IAdd %222, %223 : i32
            %cst675_i32_97 = spv.Constant 675 : i32
            %225 = spv.IMul %17, %cst675_i32_97 : i32
            %226 = spv.IAdd %224, %225 : i32
            %cst3_i32_98 = spv.Constant 3 : i32
            %227 = spv.IMul %8, %cst3_i32_98 : i32
            %228 = spv.IAdd %226, %227 : i32
            %cst3_i32_99 = spv.Constant 3 : i32
            %229 = spv.IMul %18, %cst3_i32_99 : i32
            %230 = spv.IAdd %228, %229 : i32
            %cst3_i32_100 = spv.Constant 3 : i32
            %231 = spv.IMul %95, %cst3_i32_100 : i32
            %232 = spv.IAdd %230, %231 : i32
            %cst8_i32_101 = spv.Constant 8 : i32
            %233 = spv.IAdd %232, %cst8_i32_101 : i32
            %cst0_i32_102 = spv.Constant 0 : i32
            %cst0_i32_103 = spv.Constant 0 : i32
            %cst1_i32_104 = spv.Constant 1 : i32
            %234 = spv.IMul %cst1_i32_104, %233 : i32
            %235 = spv.IAdd %cst0_i32_103, %234 : i32
            %236 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_102, %235] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %237 = spv.Load "StorageBuffer" %236 : f32
            %238 = spv.CompositeInsert %237, %221[2 : i32] : f32 into vector<3xf32>
            %239 = spv.CompositeExtract %238[0 : i32] : vector<3xf32>
            %240 = spv.CompositeConstruct %239, %239, %239, %239 : vector<4xf32>
            %241 = spv.GLSL.Fma %240, %110, %97 : vector<4xf32>
            %242 = spv.CompositeExtract %238[1 : i32] : vector<3xf32>
            %243 = spv.CompositeConstruct %242, %242, %242, %242 : vector<4xf32>
            %244 = spv.GLSL.Fma %243, %119, %241 : vector<4xf32>
            %245 = spv.CompositeExtract %238[2 : i32] : vector<3xf32>
            %246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
            %247 = spv.GLSL.Fma %246, %128, %244 : vector<4xf32>
            %cst675_i32_105 = spv.Constant 675 : i32
            %248 = spv.IMul %80, %cst675_i32_105 : i32
            %cst675_i32_106 = spv.Constant 675 : i32
            %249 = spv.IMul %21, %cst675_i32_106 : i32
            %250 = spv.IAdd %248, %249 : i32
            %cst675_i32_107 = spv.Constant 675 : i32
            %251 = spv.IMul %17, %cst675_i32_107 : i32
            %252 = spv.IAdd %250, %251 : i32
            %cst3_i32_108 = spv.Constant 3 : i32
            %253 = spv.IMul %8, %cst3_i32_108 : i32
            %254 = spv.IAdd %252, %253 : i32
            %cst3_i32_109 = spv.Constant 3 : i32
            %255 = spv.IMul %18, %cst3_i32_109 : i32
            %256 = spv.IAdd %254, %255 : i32
            %cst3_i32_110 = spv.Constant 3 : i32
            %257 = spv.IMul %95, %cst3_i32_110 : i32
            %258 = spv.IAdd %256, %257 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %259 = spv.IAdd %258, %cst12_i32 : i32
            %cst0_i32_111 = spv.Constant 0 : i32
            %cst0_i32_112 = spv.Constant 0 : i32
            %cst1_i32_113 = spv.Constant 1 : i32
            %260 = spv.IMul %cst1_i32_113, %259 : i32
            %261 = spv.IAdd %cst0_i32_112, %260 : i32
            %262 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_111, %261] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %263 = spv.Load "StorageBuffer" %262 : f32
            %264 = spv.CompositeInsert %263, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
            %cst675_i32_114 = spv.Constant 675 : i32
            %265 = spv.IMul %80, %cst675_i32_114 : i32
            %cst675_i32_115 = spv.Constant 675 : i32
            %266 = spv.IMul %21, %cst675_i32_115 : i32
            %267 = spv.IAdd %265, %266 : i32
            %cst675_i32_116 = spv.Constant 675 : i32
            %268 = spv.IMul %17, %cst675_i32_116 : i32
            %269 = spv.IAdd %267, %268 : i32
            %cst3_i32_117 = spv.Constant 3 : i32
            %270 = spv.IMul %8, %cst3_i32_117 : i32
            %271 = spv.IAdd %269, %270 : i32
            %cst3_i32_118 = spv.Constant 3 : i32
            %272 = spv.IMul %18, %cst3_i32_118 : i32
            %273 = spv.IAdd %271, %272 : i32
            %cst3_i32_119 = spv.Constant 3 : i32
            %274 = spv.IMul %95, %cst3_i32_119 : i32
            %275 = spv.IAdd %273, %274 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %276 = spv.IAdd %275, %cst13_i32 : i32
            %cst0_i32_120 = spv.Constant 0 : i32
            %cst0_i32_121 = spv.Constant 0 : i32
            %cst1_i32_122 = spv.Constant 1 : i32
            %277 = spv.IMul %cst1_i32_122, %276 : i32
            %278 = spv.IAdd %cst0_i32_121, %277 : i32
            %279 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_120, %278] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %280 = spv.Load "StorageBuffer" %279 : f32
            %281 = spv.CompositeInsert %280, %264[1 : i32] : f32 into vector<3xf32>
            %cst675_i32_123 = spv.Constant 675 : i32
            %282 = spv.IMul %80, %cst675_i32_123 : i32
            %cst675_i32_124 = spv.Constant 675 : i32
            %283 = spv.IMul %21, %cst675_i32_124 : i32
            %284 = spv.IAdd %282, %283 : i32
            %cst675_i32_125 = spv.Constant 675 : i32
            %285 = spv.IMul %17, %cst675_i32_125 : i32
            %286 = spv.IAdd %284, %285 : i32
            %cst3_i32_126 = spv.Constant 3 : i32
            %287 = spv.IMul %8, %cst3_i32_126 : i32
            %288 = spv.IAdd %286, %287 : i32
            %cst3_i32_127 = spv.Constant 3 : i32
            %289 = spv.IMul %18, %cst3_i32_127 : i32
            %290 = spv.IAdd %288, %289 : i32
            %cst3_i32_128 = spv.Constant 3 : i32
            %291 = spv.IMul %95, %cst3_i32_128 : i32
            %292 = spv.IAdd %290, %291 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %293 = spv.IAdd %292, %cst14_i32 : i32
            %cst0_i32_129 = spv.Constant 0 : i32
            %cst0_i32_130 = spv.Constant 0 : i32
            %cst1_i32_131 = spv.Constant 1 : i32
            %294 = spv.IMul %cst1_i32_131, %293 : i32
            %295 = spv.IAdd %cst0_i32_130, %294 : i32
            %296 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_129, %295] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %297 = spv.Load "StorageBuffer" %296 : f32
            %298 = spv.CompositeInsert %297, %281[2 : i32] : f32 into vector<3xf32>
            %299 = spv.CompositeExtract %298[0 : i32] : vector<3xf32>
            %300 = spv.CompositeConstruct %299, %299, %299, %299 : vector<4xf32>
            %301 = spv.GLSL.Fma %300, %110, %98 : vector<4xf32>
            %302 = spv.CompositeExtract %298[1 : i32] : vector<3xf32>
            %303 = spv.CompositeConstruct %302, %302, %302, %302 : vector<4xf32>
            %304 = spv.GLSL.Fma %303, %119, %301 : vector<4xf32>
            %305 = spv.CompositeExtract %298[2 : i32] : vector<3xf32>
            %306 = spv.CompositeConstruct %305, %305, %305, %305 : vector<4xf32>
            %307 = spv.GLSL.Fma %306, %128, %304 : vector<4xf32>
            %cst675_i32_132 = spv.Constant 675 : i32
            %308 = spv.IMul %80, %cst675_i32_132 : i32
            %cst675_i32_133 = spv.Constant 675 : i32
            %309 = spv.IMul %21, %cst675_i32_133 : i32
            %310 = spv.IAdd %308, %309 : i32
            %cst675_i32_134 = spv.Constant 675 : i32
            %311 = spv.IMul %17, %cst675_i32_134 : i32
            %312 = spv.IAdd %310, %311 : i32
            %cst3_i32_135 = spv.Constant 3 : i32
            %313 = spv.IMul %8, %cst3_i32_135 : i32
            %314 = spv.IAdd %312, %313 : i32
            %cst3_i32_136 = spv.Constant 3 : i32
            %315 = spv.IMul %18, %cst3_i32_136 : i32
            %316 = spv.IAdd %314, %315 : i32
            %cst3_i32_137 = spv.Constant 3 : i32
            %317 = spv.IMul %95, %cst3_i32_137 : i32
            %318 = spv.IAdd %316, %317 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %319 = spv.IAdd %318, %cst18_i32 : i32
            %cst0_i32_138 = spv.Constant 0 : i32
            %cst0_i32_139 = spv.Constant 0 : i32
            %cst1_i32_140 = spv.Constant 1 : i32
            %320 = spv.IMul %cst1_i32_140, %319 : i32
            %321 = spv.IAdd %cst0_i32_139, %320 : i32
            %322 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_138, %321] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %323 = spv.Load "StorageBuffer" %322 : f32
            %324 = spv.CompositeInsert %323, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
            %cst675_i32_141 = spv.Constant 675 : i32
            %325 = spv.IMul %80, %cst675_i32_141 : i32
            %cst675_i32_142 = spv.Constant 675 : i32
            %326 = spv.IMul %21, %cst675_i32_142 : i32
            %327 = spv.IAdd %325, %326 : i32
            %cst675_i32_143 = spv.Constant 675 : i32
            %328 = spv.IMul %17, %cst675_i32_143 : i32
            %329 = spv.IAdd %327, %328 : i32
            %cst3_i32_144 = spv.Constant 3 : i32
            %330 = spv.IMul %8, %cst3_i32_144 : i32
            %331 = spv.IAdd %329, %330 : i32
            %cst3_i32_145 = spv.Constant 3 : i32
            %332 = spv.IMul %18, %cst3_i32_145 : i32
            %333 = spv.IAdd %331, %332 : i32
            %cst3_i32_146 = spv.Constant 3 : i32
            %334 = spv.IMul %95, %cst3_i32_146 : i32
            %335 = spv.IAdd %333, %334 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %336 = spv.IAdd %335, %cst19_i32 : i32
            %cst0_i32_147 = spv.Constant 0 : i32
            %cst0_i32_148 = spv.Constant 0 : i32
            %cst1_i32_149 = spv.Constant 1 : i32
            %337 = spv.IMul %cst1_i32_149, %336 : i32
            %338 = spv.IAdd %cst0_i32_148, %337 : i32
            %339 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_147, %338] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %340 = spv.Load "StorageBuffer" %339 : f32
            %341 = spv.CompositeInsert %340, %324[1 : i32] : f32 into vector<3xf32>
            %cst675_i32_150 = spv.Constant 675 : i32
            %342 = spv.IMul %80, %cst675_i32_150 : i32
            %cst675_i32_151 = spv.Constant 675 : i32
            %343 = spv.IMul %21, %cst675_i32_151 : i32
            %344 = spv.IAdd %342, %343 : i32
            %cst675_i32_152 = spv.Constant 675 : i32
            %345 = spv.IMul %17, %cst675_i32_152 : i32
            %346 = spv.IAdd %344, %345 : i32
            %cst3_i32_153 = spv.Constant 3 : i32
            %347 = spv.IMul %8, %cst3_i32_153 : i32
            %348 = spv.IAdd %346, %347 : i32
            %cst3_i32_154 = spv.Constant 3 : i32
            %349 = spv.IMul %18, %cst3_i32_154 : i32
            %350 = spv.IAdd %348, %349 : i32
            %cst3_i32_155 = spv.Constant 3 : i32
            %351 = spv.IMul %95, %cst3_i32_155 : i32
            %352 = spv.IAdd %350, %351 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %353 = spv.IAdd %352, %cst20_i32 : i32
            %cst0_i32_156 = spv.Constant 0 : i32
            %cst0_i32_157 = spv.Constant 0 : i32
            %cst1_i32_158 = spv.Constant 1 : i32
            %354 = spv.IMul %cst1_i32_158, %353 : i32
            %355 = spv.IAdd %cst0_i32_157, %354 : i32
            %356 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_156, %355] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %357 = spv.Load "StorageBuffer" %356 : f32
            %358 = spv.CompositeInsert %357, %341[2 : i32] : f32 into vector<3xf32>
            %359 = spv.CompositeExtract %358[0 : i32] : vector<3xf32>
            %360 = spv.CompositeConstruct %359, %359, %359, %359 : vector<4xf32>
            %361 = spv.GLSL.Fma %360, %110, %99 : vector<4xf32>
            %362 = spv.CompositeExtract %358[1 : i32] : vector<3xf32>
            %363 = spv.CompositeConstruct %362, %362, %362, %362 : vector<4xf32>
            %364 = spv.GLSL.Fma %363, %119, %361 : vector<4xf32>
            %365 = spv.CompositeExtract %358[2 : i32] : vector<3xf32>
            %366 = spv.CompositeConstruct %365, %365, %365, %365 : vector<4xf32>
            %367 = spv.GLSL.Fma %366, %128, %364 : vector<4xf32>
            spv.Store "Function" %86, %187 : vector<4xf32>
            spv.Store "Function" %87, %247 : vector<4xf32>
            spv.Store "Function" %88, %307 : vector<4xf32>
            spv.Store "Function" %89, %367 : vector<4xf32>
            %368 = spv.IAdd %95, %cst1_i32 : i32
            spv.Branch ^bb1(%368, %187, %247, %307, %367 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
          ^bb3:  // pred: ^bb1
            spv.mlir.merge
          }
          %90 = spv.Load "Function" %89 : vector<4xf32>
          %91 = spv.Load "Function" %88 : vector<4xf32>
          %92 = spv.Load "Function" %87 : vector<4xf32>
          %93 = spv.Load "Function" %86 : vector<4xf32>
          spv.Store "Function" %22, %93 : vector<4xf32>
          spv.Store "Function" %23, %92 : vector<4xf32>
          spv.Store "Function" %24, %91 : vector<4xf32>
          spv.Store "Function" %25, %90 : vector<4xf32>
          %94 = spv.IAdd %80, %cst1_i32 : i32
          spv.Branch ^bb1(%94, %93, %92, %91, %90 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        %26 = spv.Load "Function" %25 : vector<4xf32>
        %27 = spv.Load "Function" %24 : vector<4xf32>
        %28 = spv.Load "Function" %23 : vector<4xf32>
        %29 = spv.Load "Function" %22 : vector<4xf32>
        %30 = spv.IAdd %7, %16 : i32
        %31 = spv.SDiv %30, %cst4_i32 : i32
        %cst896_i32 = spv.Constant 896 : i32
        %32 = spv.IMul %19, %cst896_i32 : i32
        %33 = spv.IAdd %31, %32 : i32
        %cst896_i32_4 = spv.Constant 896 : i32
        %34 = spv.IMul %14, %cst896_i32_4 : i32
        %35 = spv.IAdd %33, %34 : i32
        %cst8_i32_5 = spv.Constant 8 : i32
        %36 = spv.IMul %6, %cst8_i32_5 : i32
        %37 = spv.IAdd %35, %36 : i32
        %cst8_i32_6 = spv.Constant 8 : i32
        %38 = spv.IMul %15, %cst8_i32_6 : i32
        %39 = spv.IAdd %37, %38 : i32
        %cst24_i32 = spv.Constant 24 : i32
        %40 = spv.IAdd %39, %cst24_i32 : i32
        %cst0_i32_7 = spv.Constant 0 : i32
        %cst0_i32_8 = spv.Constant 0 : i32
        %cst1_i32_9 = spv.Constant 1 : i32
        %41 = spv.IMul %cst1_i32_9, %40 : i32
        %42 = spv.IAdd %cst0_i32_8, %41 : i32
        %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_7, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %43, %26 : vector<4xf32>
        %cst896_i32_10 = spv.Constant 896 : i32
        %44 = spv.IMul %19, %cst896_i32_10 : i32
        %45 = spv.IAdd %31, %44 : i32
        %cst896_i32_11 = spv.Constant 896 : i32
        %46 = spv.IMul %14, %cst896_i32_11 : i32
        %47 = spv.IAdd %45, %46 : i32
        %cst8_i32_12 = spv.Constant 8 : i32
        %48 = spv.IMul %6, %cst8_i32_12 : i32
        %49 = spv.IAdd %47, %48 : i32
        %cst8_i32_13 = spv.Constant 8 : i32
        %50 = spv.IMul %15, %cst8_i32_13 : i32
        %51 = spv.IAdd %49, %50 : i32
        %cst16_i32_14 = spv.Constant 16 : i32
        %52 = spv.IAdd %51, %cst16_i32_14 : i32
        %cst0_i32_15 = spv.Constant 0 : i32
        %cst0_i32_16 = spv.Constant 0 : i32
        %cst1_i32_17 = spv.Constant 1 : i32
        %53 = spv.IMul %cst1_i32_17, %52 : i32
        %54 = spv.IAdd %cst0_i32_16, %53 : i32
        %55 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_15, %54] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %55, %27 : vector<4xf32>
        %cst896_i32_18 = spv.Constant 896 : i32
        %56 = spv.IMul %19, %cst896_i32_18 : i32
        %57 = spv.IAdd %31, %56 : i32
        %cst896_i32_19 = spv.Constant 896 : i32
        %58 = spv.IMul %14, %cst896_i32_19 : i32
        %59 = spv.IAdd %57, %58 : i32
        %cst8_i32_20 = spv.Constant 8 : i32
        %60 = spv.IMul %6, %cst8_i32_20 : i32
        %61 = spv.IAdd %59, %60 : i32
        %cst8_i32_21 = spv.Constant 8 : i32
        %62 = spv.IMul %15, %cst8_i32_21 : i32
        %63 = spv.IAdd %61, %62 : i32
        %cst8_i32_22 = spv.Constant 8 : i32
        %64 = spv.IAdd %63, %cst8_i32_22 : i32
        %cst0_i32_23 = spv.Constant 0 : i32
        %cst0_i32_24 = spv.Constant 0 : i32
        %cst1_i32_25 = spv.Constant 1 : i32
        %65 = spv.IMul %cst1_i32_25, %64 : i32
        %66 = spv.IAdd %cst0_i32_24, %65 : i32
        %67 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_23, %66] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %67, %28 : vector<4xf32>
        %cst896_i32_26 = spv.Constant 896 : i32
        %68 = spv.IMul %19, %cst896_i32_26 : i32
        %69 = spv.IAdd %31, %68 : i32
        %cst896_i32_27 = spv.Constant 896 : i32
        %70 = spv.IMul %14, %cst896_i32_27 : i32
        %71 = spv.IAdd %69, %70 : i32
        %cst8_i32_28 = spv.Constant 8 : i32
        %72 = spv.IMul %6, %cst8_i32_28 : i32
        %73 = spv.IAdd %71, %72 : i32
        %cst8_i32_29 = spv.Constant 8 : i32
        %74 = spv.IMul %15, %cst8_i32_29 : i32
        %75 = spv.IAdd %73, %74 : i32
        %cst0_i32_30 = spv.Constant 0 : i32
        %cst0_i32_31 = spv.Constant 0 : i32
        %cst1_i32_32 = spv.Constant 1 : i32
        %76 = spv.IMul %cst1_i32_32, %75 : i32
        %77 = spv.IAdd %cst0_i32_31, %76 : i32
        %78 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_30, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %78, %29 : vector<4xf32>
        %79 = spv.IAdd %19, %cst112_i32 : i32
        spv.Branch ^bb1(%79 : i32)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      spv.Return
    }
  }
  hal.interface private @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// -----// IR Dump After SPIRVLowerABIAttributes //----- //
spv.module Logical GLSL450 {
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    %cst100352_i32 = spv.Constant 100352 : i32
    %cst216_i32 = spv.Constant 216 : i32
    %cst151875_i32 = spv.Constant 151875 : i32
    %cst1_i32 = spv.Constant 1 : i32
    %cst3_i32 = spv.Constant 3 : i32
    %cst112_i32 = spv.Constant 112 : i32
    %cst0_i32 = spv.Constant 0 : i32
    %cst32_i32 = spv.Constant 32 : i32
    %cst16_i32 = spv.Constant 16 : i32
    %cst4_i32 = spv.Constant 4 : i32
    %cst2_i32 = spv.Constant 2 : i32
    %cst8_i32 = spv.Constant 8 : i32
    %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
    %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
    %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
    %__builtin_var_WorkgroupId___addr_0 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
    %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
    %__builtin_var_WorkgroupId___addr_1 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_1 : vector<3xi32>
    %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
    %6 = spv.IMul %3, %cst8_i32 : i32
    %7 = spv.IMul %1, %cst32_i32 : i32
    %8 = spv.IMul %3, %cst16_i32 : i32
    %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %__builtin_var_LocalInvocationId___addr_2 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_2 : vector<3xi32>
    %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
    %__builtin_var_LocalInvocationId___addr_3 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_3 : vector<3xi32>
    %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
    %15 = spv.IMul %12, %cst4_i32 : i32
    %16 = spv.IMul %10, %cst4_i32 : i32
    %17 = spv.IMul %14, %cst2_i32 : i32
    %18 = spv.IMul %12, %cst8_i32 : i32
    spv.mlir.loop {
      spv.Branch ^bb1(%5 : i32)
    ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
      %20 = spv.SLessThan %19, %cst112_i32 : i32
      spv.BranchConditional %20, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %21 = spv.IMul %19, %cst2_i32 : i32
      %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      spv.mlir.loop {
        spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%80: i32, %81: vector<4xf32>, %82: vector<4xf32>, %83: vector<4xf32>, %84: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %85 = spv.SLessThan %80, %cst3_i32 : i32
        spv.BranchConditional %85, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %86 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %87 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %88 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %89 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        spv.mlir.loop {
          spv.Branch ^bb1(%cst0_i32, %81, %82, %83, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%95: i32, %96: vector<4xf32>, %97: vector<4xf32>, %98: vector<4xf32>, %99: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %100 = spv.SLessThan %95, %cst3_i32 : i32
          spv.BranchConditional %100, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %101 = spv.IAdd %7, %16 : i32
          %102 = spv.SDiv %101, %cst4_i32 : i32
          %cst72_i32 = spv.Constant 72 : i32
          %103 = spv.IMul %80, %cst72_i32 : i32
          %cst24_i32_33 = spv.Constant 24 : i32
          %104 = spv.IMul %95, %cst24_i32_33 : i32
          %105 = spv.IAdd %103, %104 : i32
          %106 = spv.IAdd %105, %102 : i32
          %cst0_i32_34 = spv.Constant 0 : i32
          %cst0_i32_35 = spv.Constant 0 : i32
          %cst1_i32_36 = spv.Constant 1 : i32
          %107 = spv.IMul %cst1_i32_36, %106 : i32
          %108 = spv.IAdd %cst0_i32_35, %107 : i32
          %109 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_34, %108] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
          %cst72_i32_37 = spv.Constant 72 : i32
          %111 = spv.IMul %80, %cst72_i32_37 : i32
          %cst24_i32_38 = spv.Constant 24 : i32
          %112 = spv.IMul %95, %cst24_i32_38 : i32
          %113 = spv.IAdd %111, %112 : i32
          %114 = spv.IAdd %113, %102 : i32
          %cst8_i32_39 = spv.Constant 8 : i32
          %115 = spv.IAdd %114, %cst8_i32_39 : i32
          %cst0_i32_40 = spv.Constant 0 : i32
          %cst0_i32_41 = spv.Constant 0 : i32
          %cst1_i32_42 = spv.Constant 1 : i32
          %116 = spv.IMul %cst1_i32_42, %115 : i32
          %117 = spv.IAdd %cst0_i32_41, %116 : i32
          %118 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_40, %117] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %119 = spv.Load "StorageBuffer" %118 : vector<4xf32>
          %cst72_i32_43 = spv.Constant 72 : i32
          %120 = spv.IMul %80, %cst72_i32_43 : i32
          %cst24_i32_44 = spv.Constant 24 : i32
          %121 = spv.IMul %95, %cst24_i32_44 : i32
          %122 = spv.IAdd %120, %121 : i32
          %123 = spv.IAdd %122, %102 : i32
          %cst16_i32_45 = spv.Constant 16 : i32
          %124 = spv.IAdd %123, %cst16_i32_45 : i32
          %cst0_i32_46 = spv.Constant 0 : i32
          %cst0_i32_47 = spv.Constant 0 : i32
          %cst1_i32_48 = spv.Constant 1 : i32
          %125 = spv.IMul %cst1_i32_48, %124 : i32
          %126 = spv.IAdd %cst0_i32_47, %125 : i32
          %127 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_46, %126] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %128 = spv.Load "StorageBuffer" %127 : vector<4xf32>
          %cst675_i32 = spv.Constant 675 : i32
          %129 = spv.IMul %80, %cst675_i32 : i32
          %cst3_i32_49 = spv.Constant 3 : i32
          %130 = spv.IMul %95, %cst3_i32_49 : i32
          %131 = spv.IAdd %129, %130 : i32
          %cst675_i32_50 = spv.Constant 675 : i32
          %132 = spv.IMul %21, %cst675_i32_50 : i32
          %133 = spv.IAdd %131, %132 : i32
          %cst675_i32_51 = spv.Constant 675 : i32
          %134 = spv.IMul %17, %cst675_i32_51 : i32
          %135 = spv.IAdd %133, %134 : i32
          %cst3_i32_52 = spv.Constant 3 : i32
          %136 = spv.IMul %8, %cst3_i32_52 : i32
          %137 = spv.IAdd %135, %136 : i32
          %cst3_i32_53 = spv.Constant 3 : i32
          %138 = spv.IMul %18, %cst3_i32_53 : i32
          %139 = spv.IAdd %137, %138 : i32
          %cst0_i32_54 = spv.Constant 0 : i32
          %cst0_i32_55 = spv.Constant 0 : i32
          %cst1_i32_56 = spv.Constant 1 : i32
          %140 = spv.IMul %cst1_i32_56, %139 : i32
          %141 = spv.IAdd %cst0_i32_55, %140 : i32
          %142 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_54, %141] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %143 = spv.Load "StorageBuffer" %142 : f32
          %144 = spv.CompositeInsert %143, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %cst675_i32_57 = spv.Constant 675 : i32
          %145 = spv.IMul %80, %cst675_i32_57 : i32
          %cst3_i32_58 = spv.Constant 3 : i32
          %146 = spv.IMul %95, %cst3_i32_58 : i32
          %147 = spv.IAdd %145, %146 : i32
          %cst675_i32_59 = spv.Constant 675 : i32
          %148 = spv.IMul %21, %cst675_i32_59 : i32
          %149 = spv.IAdd %147, %148 : i32
          %cst675_i32_60 = spv.Constant 675 : i32
          %150 = spv.IMul %17, %cst675_i32_60 : i32
          %151 = spv.IAdd %149, %150 : i32
          %cst3_i32_61 = spv.Constant 3 : i32
          %152 = spv.IMul %8, %cst3_i32_61 : i32
          %153 = spv.IAdd %151, %152 : i32
          %cst3_i32_62 = spv.Constant 3 : i32
          %154 = spv.IMul %18, %cst3_i32_62 : i32
          %155 = spv.IAdd %153, %154 : i32
          %cst1_i32_63 = spv.Constant 1 : i32
          %156 = spv.IAdd %155, %cst1_i32_63 : i32
          %cst0_i32_64 = spv.Constant 0 : i32
          %cst0_i32_65 = spv.Constant 0 : i32
          %cst1_i32_66 = spv.Constant 1 : i32
          %157 = spv.IMul %cst1_i32_66, %156 : i32
          %158 = spv.IAdd %cst0_i32_65, %157 : i32
          %159 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_64, %158] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %160 = spv.Load "StorageBuffer" %159 : f32
          %161 = spv.CompositeInsert %160, %144[1 : i32] : f32 into vector<3xf32>
          %cst675_i32_67 = spv.Constant 675 : i32
          %162 = spv.IMul %80, %cst675_i32_67 : i32
          %cst3_i32_68 = spv.Constant 3 : i32
          %163 = spv.IMul %95, %cst3_i32_68 : i32
          %164 = spv.IAdd %162, %163 : i32
          %cst675_i32_69 = spv.Constant 675 : i32
          %165 = spv.IMul %21, %cst675_i32_69 : i32
          %166 = spv.IAdd %164, %165 : i32
          %cst675_i32_70 = spv.Constant 675 : i32
          %167 = spv.IMul %17, %cst675_i32_70 : i32
          %168 = spv.IAdd %166, %167 : i32
          %cst3_i32_71 = spv.Constant 3 : i32
          %169 = spv.IMul %8, %cst3_i32_71 : i32
          %170 = spv.IAdd %168, %169 : i32
          %cst3_i32_72 = spv.Constant 3 : i32
          %171 = spv.IMul %18, %cst3_i32_72 : i32
          %172 = spv.IAdd %170, %171 : i32
          %cst2_i32_73 = spv.Constant 2 : i32
          %173 = spv.IAdd %172, %cst2_i32_73 : i32
          %cst0_i32_74 = spv.Constant 0 : i32
          %cst0_i32_75 = spv.Constant 0 : i32
          %cst1_i32_76 = spv.Constant 1 : i32
          %174 = spv.IMul %cst1_i32_76, %173 : i32
          %175 = spv.IAdd %cst0_i32_75, %174 : i32
          %176 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_74, %175] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %177 = spv.Load "StorageBuffer" %176 : f32
          %178 = spv.CompositeInsert %177, %161[2 : i32] : f32 into vector<3xf32>
          %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
          %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
          %181 = spv.GLSL.Fma %180, %110, %96 : vector<4xf32>
          %182 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
          %183 = spv.CompositeConstruct %182, %182, %182, %182 : vector<4xf32>
          %184 = spv.GLSL.Fma %183, %119, %181 : vector<4xf32>
          %185 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
          %186 = spv.CompositeConstruct %185, %185, %185, %185 : vector<4xf32>
          %187 = spv.GLSL.Fma %186, %128, %184 : vector<4xf32>
          %cst675_i32_77 = spv.Constant 675 : i32
          %188 = spv.IMul %80, %cst675_i32_77 : i32
          %cst675_i32_78 = spv.Constant 675 : i32
          %189 = spv.IMul %21, %cst675_i32_78 : i32
          %190 = spv.IAdd %188, %189 : i32
          %cst675_i32_79 = spv.Constant 675 : i32
          %191 = spv.IMul %17, %cst675_i32_79 : i32
          %192 = spv.IAdd %190, %191 : i32
          %cst3_i32_80 = spv.Constant 3 : i32
          %193 = spv.IMul %8, %cst3_i32_80 : i32
          %194 = spv.IAdd %192, %193 : i32
          %cst3_i32_81 = spv.Constant 3 : i32
          %195 = spv.IMul %18, %cst3_i32_81 : i32
          %196 = spv.IAdd %194, %195 : i32
          %cst3_i32_82 = spv.Constant 3 : i32
          %197 = spv.IMul %95, %cst3_i32_82 : i32
          %198 = spv.IAdd %196, %197 : i32
          %cst6_i32 = spv.Constant 6 : i32
          %199 = spv.IAdd %198, %cst6_i32 : i32
          %cst0_i32_83 = spv.Constant 0 : i32
          %cst0_i32_84 = spv.Constant 0 : i32
          %cst1_i32_85 = spv.Constant 1 : i32
          %200 = spv.IMul %cst1_i32_85, %199 : i32
          %201 = spv.IAdd %cst0_i32_84, %200 : i32
          %202 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_83, %201] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %203 = spv.Load "StorageBuffer" %202 : f32
          %204 = spv.CompositeInsert %203, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %cst675_i32_86 = spv.Constant 675 : i32
          %205 = spv.IMul %80, %cst675_i32_86 : i32
          %cst675_i32_87 = spv.Constant 675 : i32
          %206 = spv.IMul %21, %cst675_i32_87 : i32
          %207 = spv.IAdd %205, %206 : i32
          %cst675_i32_88 = spv.Constant 675 : i32
          %208 = spv.IMul %17, %cst675_i32_88 : i32
          %209 = spv.IAdd %207, %208 : i32
          %cst3_i32_89 = spv.Constant 3 : i32
          %210 = spv.IMul %8, %cst3_i32_89 : i32
          %211 = spv.IAdd %209, %210 : i32
          %cst3_i32_90 = spv.Constant 3 : i32
          %212 = spv.IMul %18, %cst3_i32_90 : i32
          %213 = spv.IAdd %211, %212 : i32
          %cst3_i32_91 = spv.Constant 3 : i32
          %214 = spv.IMul %95, %cst3_i32_91 : i32
          %215 = spv.IAdd %213, %214 : i32
          %cst7_i32 = spv.Constant 7 : i32
          %216 = spv.IAdd %215, %cst7_i32 : i32
          %cst0_i32_92 = spv.Constant 0 : i32
          %cst0_i32_93 = spv.Constant 0 : i32
          %cst1_i32_94 = spv.Constant 1 : i32
          %217 = spv.IMul %cst1_i32_94, %216 : i32
          %218 = spv.IAdd %cst0_i32_93, %217 : i32
          %219 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_92, %218] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %220 = spv.Load "StorageBuffer" %219 : f32
          %221 = spv.CompositeInsert %220, %204[1 : i32] : f32 into vector<3xf32>
          %cst675_i32_95 = spv.Constant 675 : i32
          %222 = spv.IMul %80, %cst675_i32_95 : i32
          %cst675_i32_96 = spv.Constant 675 : i32
          %223 = spv.IMul %21, %cst675_i32_96 : i32
          %224 = spv.IAdd %222, %223 : i32
          %cst675_i32_97 = spv.Constant 675 : i32
          %225 = spv.IMul %17, %cst675_i32_97 : i32
          %226 = spv.IAdd %224, %225 : i32
          %cst3_i32_98 = spv.Constant 3 : i32
          %227 = spv.IMul %8, %cst3_i32_98 : i32
          %228 = spv.IAdd %226, %227 : i32
          %cst3_i32_99 = spv.Constant 3 : i32
          %229 = spv.IMul %18, %cst3_i32_99 : i32
          %230 = spv.IAdd %228, %229 : i32
          %cst3_i32_100 = spv.Constant 3 : i32
          %231 = spv.IMul %95, %cst3_i32_100 : i32
          %232 = spv.IAdd %230, %231 : i32
          %cst8_i32_101 = spv.Constant 8 : i32
          %233 = spv.IAdd %232, %cst8_i32_101 : i32
          %cst0_i32_102 = spv.Constant 0 : i32
          %cst0_i32_103 = spv.Constant 0 : i32
          %cst1_i32_104 = spv.Constant 1 : i32
          %234 = spv.IMul %cst1_i32_104, %233 : i32
          %235 = spv.IAdd %cst0_i32_103, %234 : i32
          %236 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_102, %235] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %237 = spv.Load "StorageBuffer" %236 : f32
          %238 = spv.CompositeInsert %237, %221[2 : i32] : f32 into vector<3xf32>
          %239 = spv.CompositeExtract %238[0 : i32] : vector<3xf32>
          %240 = spv.CompositeConstruct %239, %239, %239, %239 : vector<4xf32>
          %241 = spv.GLSL.Fma %240, %110, %97 : vector<4xf32>
          %242 = spv.CompositeExtract %238[1 : i32] : vector<3xf32>
          %243 = spv.CompositeConstruct %242, %242, %242, %242 : vector<4xf32>
          %244 = spv.GLSL.Fma %243, %119, %241 : vector<4xf32>
          %245 = spv.CompositeExtract %238[2 : i32] : vector<3xf32>
          %246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
          %247 = spv.GLSL.Fma %246, %128, %244 : vector<4xf32>
          %cst675_i32_105 = spv.Constant 675 : i32
          %248 = spv.IMul %80, %cst675_i32_105 : i32
          %cst675_i32_106 = spv.Constant 675 : i32
          %249 = spv.IMul %21, %cst675_i32_106 : i32
          %250 = spv.IAdd %248, %249 : i32
          %cst675_i32_107 = spv.Constant 675 : i32
          %251 = spv.IMul %17, %cst675_i32_107 : i32
          %252 = spv.IAdd %250, %251 : i32
          %cst3_i32_108 = spv.Constant 3 : i32
          %253 = spv.IMul %8, %cst3_i32_108 : i32
          %254 = spv.IAdd %252, %253 : i32
          %cst3_i32_109 = spv.Constant 3 : i32
          %255 = spv.IMul %18, %cst3_i32_109 : i32
          %256 = spv.IAdd %254, %255 : i32
          %cst3_i32_110 = spv.Constant 3 : i32
          %257 = spv.IMul %95, %cst3_i32_110 : i32
          %258 = spv.IAdd %256, %257 : i32
          %cst12_i32 = spv.Constant 12 : i32
          %259 = spv.IAdd %258, %cst12_i32 : i32
          %cst0_i32_111 = spv.Constant 0 : i32
          %cst0_i32_112 = spv.Constant 0 : i32
          %cst1_i32_113 = spv.Constant 1 : i32
          %260 = spv.IMul %cst1_i32_113, %259 : i32
          %261 = spv.IAdd %cst0_i32_112, %260 : i32
          %262 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_111, %261] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %263 = spv.Load "StorageBuffer" %262 : f32
          %264 = spv.CompositeInsert %263, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %cst675_i32_114 = spv.Constant 675 : i32
          %265 = spv.IMul %80, %cst675_i32_114 : i32
          %cst675_i32_115 = spv.Constant 675 : i32
          %266 = spv.IMul %21, %cst675_i32_115 : i32
          %267 = spv.IAdd %265, %266 : i32
          %cst675_i32_116 = spv.Constant 675 : i32
          %268 = spv.IMul %17, %cst675_i32_116 : i32
          %269 = spv.IAdd %267, %268 : i32
          %cst3_i32_117 = spv.Constant 3 : i32
          %270 = spv.IMul %8, %cst3_i32_117 : i32
          %271 = spv.IAdd %269, %270 : i32
          %cst3_i32_118 = spv.Constant 3 : i32
          %272 = spv.IMul %18, %cst3_i32_118 : i32
          %273 = spv.IAdd %271, %272 : i32
          %cst3_i32_119 = spv.Constant 3 : i32
          %274 = spv.IMul %95, %cst3_i32_119 : i32
          %275 = spv.IAdd %273, %274 : i32
          %cst13_i32 = spv.Constant 13 : i32
          %276 = spv.IAdd %275, %cst13_i32 : i32
          %cst0_i32_120 = spv.Constant 0 : i32
          %cst0_i32_121 = spv.Constant 0 : i32
          %cst1_i32_122 = spv.Constant 1 : i32
          %277 = spv.IMul %cst1_i32_122, %276 : i32
          %278 = spv.IAdd %cst0_i32_121, %277 : i32
          %279 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_120, %278] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %280 = spv.Load "StorageBuffer" %279 : f32
          %281 = spv.CompositeInsert %280, %264[1 : i32] : f32 into vector<3xf32>
          %cst675_i32_123 = spv.Constant 675 : i32
          %282 = spv.IMul %80, %cst675_i32_123 : i32
          %cst675_i32_124 = spv.Constant 675 : i32
          %283 = spv.IMul %21, %cst675_i32_124 : i32
          %284 = spv.IAdd %282, %283 : i32
          %cst675_i32_125 = spv.Constant 675 : i32
          %285 = spv.IMul %17, %cst675_i32_125 : i32
          %286 = spv.IAdd %284, %285 : i32
          %cst3_i32_126 = spv.Constant 3 : i32
          %287 = spv.IMul %8, %cst3_i32_126 : i32
          %288 = spv.IAdd %286, %287 : i32
          %cst3_i32_127 = spv.Constant 3 : i32
          %289 = spv.IMul %18, %cst3_i32_127 : i32
          %290 = spv.IAdd %288, %289 : i32
          %cst3_i32_128 = spv.Constant 3 : i32
          %291 = spv.IMul %95, %cst3_i32_128 : i32
          %292 = spv.IAdd %290, %291 : i32
          %cst14_i32 = spv.Constant 14 : i32
          %293 = spv.IAdd %292, %cst14_i32 : i32
          %cst0_i32_129 = spv.Constant 0 : i32
          %cst0_i32_130 = spv.Constant 0 : i32
          %cst1_i32_131 = spv.Constant 1 : i32
          %294 = spv.IMul %cst1_i32_131, %293 : i32
          %295 = spv.IAdd %cst0_i32_130, %294 : i32
          %296 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_129, %295] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %297 = spv.Load "StorageBuffer" %296 : f32
          %298 = spv.CompositeInsert %297, %281[2 : i32] : f32 into vector<3xf32>
          %299 = spv.CompositeExtract %298[0 : i32] : vector<3xf32>
          %300 = spv.CompositeConstruct %299, %299, %299, %299 : vector<4xf32>
          %301 = spv.GLSL.Fma %300, %110, %98 : vector<4xf32>
          %302 = spv.CompositeExtract %298[1 : i32] : vector<3xf32>
          %303 = spv.CompositeConstruct %302, %302, %302, %302 : vector<4xf32>
          %304 = spv.GLSL.Fma %303, %119, %301 : vector<4xf32>
          %305 = spv.CompositeExtract %298[2 : i32] : vector<3xf32>
          %306 = spv.CompositeConstruct %305, %305, %305, %305 : vector<4xf32>
          %307 = spv.GLSL.Fma %306, %128, %304 : vector<4xf32>
          %cst675_i32_132 = spv.Constant 675 : i32
          %308 = spv.IMul %80, %cst675_i32_132 : i32
          %cst675_i32_133 = spv.Constant 675 : i32
          %309 = spv.IMul %21, %cst675_i32_133 : i32
          %310 = spv.IAdd %308, %309 : i32
          %cst675_i32_134 = spv.Constant 675 : i32
          %311 = spv.IMul %17, %cst675_i32_134 : i32
          %312 = spv.IAdd %310, %311 : i32
          %cst3_i32_135 = spv.Constant 3 : i32
          %313 = spv.IMul %8, %cst3_i32_135 : i32
          %314 = spv.IAdd %312, %313 : i32
          %cst3_i32_136 = spv.Constant 3 : i32
          %315 = spv.IMul %18, %cst3_i32_136 : i32
          %316 = spv.IAdd %314, %315 : i32
          %cst3_i32_137 = spv.Constant 3 : i32
          %317 = spv.IMul %95, %cst3_i32_137 : i32
          %318 = spv.IAdd %316, %317 : i32
          %cst18_i32 = spv.Constant 18 : i32
          %319 = spv.IAdd %318, %cst18_i32 : i32
          %cst0_i32_138 = spv.Constant 0 : i32
          %cst0_i32_139 = spv.Constant 0 : i32
          %cst1_i32_140 = spv.Constant 1 : i32
          %320 = spv.IMul %cst1_i32_140, %319 : i32
          %321 = spv.IAdd %cst0_i32_139, %320 : i32
          %322 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_138, %321] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %323 = spv.Load "StorageBuffer" %322 : f32
          %324 = spv.CompositeInsert %323, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %cst675_i32_141 = spv.Constant 675 : i32
          %325 = spv.IMul %80, %cst675_i32_141 : i32
          %cst675_i32_142 = spv.Constant 675 : i32
          %326 = spv.IMul %21, %cst675_i32_142 : i32
          %327 = spv.IAdd %325, %326 : i32
          %cst675_i32_143 = spv.Constant 675 : i32
          %328 = spv.IMul %17, %cst675_i32_143 : i32
          %329 = spv.IAdd %327, %328 : i32
          %cst3_i32_144 = spv.Constant 3 : i32
          %330 = spv.IMul %8, %cst3_i32_144 : i32
          %331 = spv.IAdd %329, %330 : i32
          %cst3_i32_145 = spv.Constant 3 : i32
          %332 = spv.IMul %18, %cst3_i32_145 : i32
          %333 = spv.IAdd %331, %332 : i32
          %cst3_i32_146 = spv.Constant 3 : i32
          %334 = spv.IMul %95, %cst3_i32_146 : i32
          %335 = spv.IAdd %333, %334 : i32
          %cst19_i32 = spv.Constant 19 : i32
          %336 = spv.IAdd %335, %cst19_i32 : i32
          %cst0_i32_147 = spv.Constant 0 : i32
          %cst0_i32_148 = spv.Constant 0 : i32
          %cst1_i32_149 = spv.Constant 1 : i32
          %337 = spv.IMul %cst1_i32_149, %336 : i32
          %338 = spv.IAdd %cst0_i32_148, %337 : i32
          %339 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_147, %338] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %340 = spv.Load "StorageBuffer" %339 : f32
          %341 = spv.CompositeInsert %340, %324[1 : i32] : f32 into vector<3xf32>
          %cst675_i32_150 = spv.Constant 675 : i32
          %342 = spv.IMul %80, %cst675_i32_150 : i32
          %cst675_i32_151 = spv.Constant 675 : i32
          %343 = spv.IMul %21, %cst675_i32_151 : i32
          %344 = spv.IAdd %342, %343 : i32
          %cst675_i32_152 = spv.Constant 675 : i32
          %345 = spv.IMul %17, %cst675_i32_152 : i32
          %346 = spv.IAdd %344, %345 : i32
          %cst3_i32_153 = spv.Constant 3 : i32
          %347 = spv.IMul %8, %cst3_i32_153 : i32
          %348 = spv.IAdd %346, %347 : i32
          %cst3_i32_154 = spv.Constant 3 : i32
          %349 = spv.IMul %18, %cst3_i32_154 : i32
          %350 = spv.IAdd %348, %349 : i32
          %cst3_i32_155 = spv.Constant 3 : i32
          %351 = spv.IMul %95, %cst3_i32_155 : i32
          %352 = spv.IAdd %350, %351 : i32
          %cst20_i32 = spv.Constant 20 : i32
          %353 = spv.IAdd %352, %cst20_i32 : i32
          %cst0_i32_156 = spv.Constant 0 : i32
          %cst0_i32_157 = spv.Constant 0 : i32
          %cst1_i32_158 = spv.Constant 1 : i32
          %354 = spv.IMul %cst1_i32_158, %353 : i32
          %355 = spv.IAdd %cst0_i32_157, %354 : i32
          %356 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_156, %355] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %357 = spv.Load "StorageBuffer" %356 : f32
          %358 = spv.CompositeInsert %357, %341[2 : i32] : f32 into vector<3xf32>
          %359 = spv.CompositeExtract %358[0 : i32] : vector<3xf32>
          %360 = spv.CompositeConstruct %359, %359, %359, %359 : vector<4xf32>
          %361 = spv.GLSL.Fma %360, %110, %99 : vector<4xf32>
          %362 = spv.CompositeExtract %358[1 : i32] : vector<3xf32>
          %363 = spv.CompositeConstruct %362, %362, %362, %362 : vector<4xf32>
          %364 = spv.GLSL.Fma %363, %119, %361 : vector<4xf32>
          %365 = spv.CompositeExtract %358[2 : i32] : vector<3xf32>
          %366 = spv.CompositeConstruct %365, %365, %365, %365 : vector<4xf32>
          %367 = spv.GLSL.Fma %366, %128, %364 : vector<4xf32>
          spv.Store "Function" %86, %187 : vector<4xf32>
          spv.Store "Function" %87, %247 : vector<4xf32>
          spv.Store "Function" %88, %307 : vector<4xf32>
          spv.Store "Function" %89, %367 : vector<4xf32>
          %368 = spv.IAdd %95, %cst1_i32 : i32
          spv.Branch ^bb1(%368, %187, %247, %307, %367 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        %90 = spv.Load "Function" %89 : vector<4xf32>
        %91 = spv.Load "Function" %88 : vector<4xf32>
        %92 = spv.Load "Function" %87 : vector<4xf32>
        %93 = spv.Load "Function" %86 : vector<4xf32>
        spv.Store "Function" %22, %93 : vector<4xf32>
        spv.Store "Function" %23, %92 : vector<4xf32>
        spv.Store "Function" %24, %91 : vector<4xf32>
        spv.Store "Function" %25, %90 : vector<4xf32>
        %94 = spv.IAdd %80, %cst1_i32 : i32
        spv.Branch ^bb1(%94, %93, %92, %91, %90 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      %26 = spv.Load "Function" %25 : vector<4xf32>
      %27 = spv.Load "Function" %24 : vector<4xf32>
      %28 = spv.Load "Function" %23 : vector<4xf32>
      %29 = spv.Load "Function" %22 : vector<4xf32>
      %30 = spv.IAdd %7, %16 : i32
      %31 = spv.SDiv %30, %cst4_i32 : i32
      %cst896_i32 = spv.Constant 896 : i32
      %32 = spv.IMul %19, %cst896_i32 : i32
      %33 = spv.IAdd %31, %32 : i32
      %cst896_i32_4 = spv.Constant 896 : i32
      %34 = spv.IMul %14, %cst896_i32_4 : i32
      %35 = spv.IAdd %33, %34 : i32
      %cst8_i32_5 = spv.Constant 8 : i32
      %36 = spv.IMul %6, %cst8_i32_5 : i32
      %37 = spv.IAdd %35, %36 : i32
      %cst8_i32_6 = spv.Constant 8 : i32
      %38 = spv.IMul %15, %cst8_i32_6 : i32
      %39 = spv.IAdd %37, %38 : i32
      %cst24_i32 = spv.Constant 24 : i32
      %40 = spv.IAdd %39, %cst24_i32 : i32
      %cst0_i32_7 = spv.Constant 0 : i32
      %cst0_i32_8 = spv.Constant 0 : i32
      %cst1_i32_9 = spv.Constant 1 : i32
      %41 = spv.IMul %cst1_i32_9, %40 : i32
      %42 = spv.IAdd %cst0_i32_8, %41 : i32
      %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_7, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %43, %26 : vector<4xf32>
      %cst896_i32_10 = spv.Constant 896 : i32
      %44 = spv.IMul %19, %cst896_i32_10 : i32
      %45 = spv.IAdd %31, %44 : i32
      %cst896_i32_11 = spv.Constant 896 : i32
      %46 = spv.IMul %14, %cst896_i32_11 : i32
      %47 = spv.IAdd %45, %46 : i32
      %cst8_i32_12 = spv.Constant 8 : i32
      %48 = spv.IMul %6, %cst8_i32_12 : i32
      %49 = spv.IAdd %47, %48 : i32
      %cst8_i32_13 = spv.Constant 8 : i32
      %50 = spv.IMul %15, %cst8_i32_13 : i32
      %51 = spv.IAdd %49, %50 : i32
      %cst16_i32_14 = spv.Constant 16 : i32
      %52 = spv.IAdd %51, %cst16_i32_14 : i32
      %cst0_i32_15 = spv.Constant 0 : i32
      %cst0_i32_16 = spv.Constant 0 : i32
      %cst1_i32_17 = spv.Constant 1 : i32
      %53 = spv.IMul %cst1_i32_17, %52 : i32
      %54 = spv.IAdd %cst0_i32_16, %53 : i32
      %55 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_15, %54] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %55, %27 : vector<4xf32>
      %cst896_i32_18 = spv.Constant 896 : i32
      %56 = spv.IMul %19, %cst896_i32_18 : i32
      %57 = spv.IAdd %31, %56 : i32
      %cst896_i32_19 = spv.Constant 896 : i32
      %58 = spv.IMul %14, %cst896_i32_19 : i32
      %59 = spv.IAdd %57, %58 : i32
      %cst8_i32_20 = spv.Constant 8 : i32
      %60 = spv.IMul %6, %cst8_i32_20 : i32
      %61 = spv.IAdd %59, %60 : i32
      %cst8_i32_21 = spv.Constant 8 : i32
      %62 = spv.IMul %15, %cst8_i32_21 : i32
      %63 = spv.IAdd %61, %62 : i32
      %cst8_i32_22 = spv.Constant 8 : i32
      %64 = spv.IAdd %63, %cst8_i32_22 : i32
      %cst0_i32_23 = spv.Constant 0 : i32
      %cst0_i32_24 = spv.Constant 0 : i32
      %cst1_i32_25 = spv.Constant 1 : i32
      %65 = spv.IMul %cst1_i32_25, %64 : i32
      %66 = spv.IAdd %cst0_i32_24, %65 : i32
      %67 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_23, %66] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %67, %28 : vector<4xf32>
      %cst896_i32_26 = spv.Constant 896 : i32
      %68 = spv.IMul %19, %cst896_i32_26 : i32
      %69 = spv.IAdd %31, %68 : i32
      %cst896_i32_27 = spv.Constant 896 : i32
      %70 = spv.IMul %14, %cst896_i32_27 : i32
      %71 = spv.IAdd %69, %70 : i32
      %cst8_i32_28 = spv.Constant 8 : i32
      %72 = spv.IMul %6, %cst8_i32_28 : i32
      %73 = spv.IAdd %71, %72 : i32
      %cst8_i32_29 = spv.Constant 8 : i32
      %74 = spv.IMul %15, %cst8_i32_29 : i32
      %75 = spv.IAdd %73, %74 : i32
      %cst0_i32_30 = spv.Constant 0 : i32
      %cst0_i32_31 = spv.Constant 0 : i32
      %cst1_i32_32 = spv.Constant 1 : i32
      %76 = spv.IMul %cst1_i32_32, %75 : i32
      %77 = spv.IAdd %cst0_i32_31, %76 : i32
      %78 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_30, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %78, %29 : vector<4xf32>
      %79 = spv.IAdd %19, %cst112_i32 : i32
      spv.Branch ^bb1(%79 : i32)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}

// -----// IR Dump After Canonicalizer //----- //
spv.module Logical GLSL450 {
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    %cst896_i32 = spv.Constant 896 : i32
    %cst20_i32 = spv.Constant 20 : i32
    %cst19_i32 = spv.Constant 19 : i32
    %cst18_i32 = spv.Constant 18 : i32
    %cst14_i32 = spv.Constant 14 : i32
    %cst13_i32 = spv.Constant 13 : i32
    %cst12_i32 = spv.Constant 12 : i32
    %cst7_i32 = spv.Constant 7 : i32
    %cst6_i32 = spv.Constant 6 : i32
    %cst675_i32 = spv.Constant 675 : i32
    %cst24_i32 = spv.Constant 24 : i32
    %cst72_i32 = spv.Constant 72 : i32
    %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
    %cst8_i32 = spv.Constant 8 : i32
    %cst2_i32 = spv.Constant 2 : i32
    %cst4_i32 = spv.Constant 4 : i32
    %cst16_i32 = spv.Constant 16 : i32
    %cst32_i32 = spv.Constant 32 : i32
    %cst0_i32 = spv.Constant 0 : i32
    %cst112_i32 = spv.Constant 112 : i32
    %cst3_i32 = spv.Constant 3 : i32
    %cst1_i32 = spv.Constant 1 : i32
    %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
    %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
    %__builtin_var_WorkgroupId___addr_0 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
    %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
    %__builtin_var_WorkgroupId___addr_1 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_1 : vector<3xi32>
    %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
    %6 = spv.IMul %3, %cst8_i32 : i32
    %7 = spv.IMul %1, %cst32_i32 : i32
    %8 = spv.IMul %3, %cst16_i32 : i32
    %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %__builtin_var_LocalInvocationId___addr_2 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_2 : vector<3xi32>
    %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
    %__builtin_var_LocalInvocationId___addr_3 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_3 : vector<3xi32>
    %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
    %15 = spv.IMul %12, %cst4_i32 : i32
    %16 = spv.IMul %10, %cst4_i32 : i32
    %17 = spv.IMul %14, %cst2_i32 : i32
    %18 = spv.IMul %12, %cst8_i32 : i32
    spv.mlir.loop {
      spv.Branch ^bb1(%5 : i32)
    ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
      %20 = spv.SLessThan %19, %cst112_i32 : i32
      spv.BranchConditional %20, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %21 = spv.IMul %19, %cst2_i32 : i32
      %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      spv.mlir.loop {
        spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%72: i32, %73: vector<4xf32>, %74: vector<4xf32>, %75: vector<4xf32>, %76: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %77 = spv.SLessThan %72, %cst3_i32 : i32
        spv.BranchConditional %77, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %78 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %79 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        spv.mlir.loop {
          spv.Branch ^bb1(%cst0_i32, %73, %74, %75, %76 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%87: i32, %88: vector<4xf32>, %89: vector<4xf32>, %90: vector<4xf32>, %91: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %92 = spv.SLessThan %87, %cst3_i32 : i32
          spv.BranchConditional %92, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %93 = spv.IAdd %7, %16 : i32
          %94 = spv.SDiv %93, %cst4_i32 : i32
          %95 = spv.IMul %72, %cst72_i32 : i32
          %96 = spv.IMul %87, %cst24_i32 : i32
          %97 = spv.IAdd %95, %96 : i32
          %98 = spv.IAdd %97, %94 : i32
          %99 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %98] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %100 = spv.Load "StorageBuffer" %99 : vector<4xf32>
          %101 = spv.IMul %72, %cst72_i32 : i32
          %102 = spv.IMul %87, %cst24_i32 : i32
          %103 = spv.IAdd %101, %102 : i32
          %104 = spv.IAdd %103, %94 : i32
          %105 = spv.IAdd %104, %cst8_i32 : i32
          %106 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %105] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %107 = spv.Load "StorageBuffer" %106 : vector<4xf32>
          %108 = spv.IMul %72, %cst72_i32 : i32
          %109 = spv.IMul %87, %cst24_i32 : i32
          %110 = spv.IAdd %108, %109 : i32
          %111 = spv.IAdd %110, %94 : i32
          %112 = spv.IAdd %111, %cst16_i32 : i32
          %113 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %112] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %114 = spv.Load "StorageBuffer" %113 : vector<4xf32>
          %115 = spv.IMul %72, %cst675_i32 : i32
          %116 = spv.IMul %87, %cst3_i32 : i32
          %117 = spv.IAdd %115, %116 : i32
          %118 = spv.IMul %21, %cst675_i32 : i32
          %119 = spv.IAdd %117, %118 : i32
          %120 = spv.IMul %17, %cst675_i32 : i32
          %121 = spv.IAdd %119, %120 : i32
          %122 = spv.IMul %8, %cst3_i32 : i32
          %123 = spv.IAdd %121, %122 : i32
          %124 = spv.IMul %18, %cst3_i32 : i32
          %125 = spv.IAdd %123, %124 : i32
          %126 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %125] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %127 = spv.Load "StorageBuffer" %126 : f32
          %128 = spv.CompositeInsert %127, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %129 = spv.IMul %72, %cst675_i32 : i32
          %130 = spv.IMul %87, %cst3_i32 : i32
          %131 = spv.IAdd %129, %130 : i32
          %132 = spv.IMul %21, %cst675_i32 : i32
          %133 = spv.IAdd %131, %132 : i32
          %134 = spv.IMul %17, %cst675_i32 : i32
          %135 = spv.IAdd %133, %134 : i32
          %136 = spv.IMul %8, %cst3_i32 : i32
          %137 = spv.IAdd %135, %136 : i32
          %138 = spv.IMul %18, %cst3_i32 : i32
          %139 = spv.IAdd %137, %138 : i32
          %140 = spv.IAdd %139, %cst1_i32 : i32
          %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %142 = spv.Load "StorageBuffer" %141 : f32
          %143 = spv.CompositeInsert %142, %128[1 : i32] : f32 into vector<3xf32>
          %144 = spv.IMul %72, %cst675_i32 : i32
          %145 = spv.IMul %87, %cst3_i32 : i32
          %146 = spv.IAdd %144, %145 : i32
          %147 = spv.IMul %21, %cst675_i32 : i32
          %148 = spv.IAdd %146, %147 : i32
          %149 = spv.IMul %17, %cst675_i32 : i32
          %150 = spv.IAdd %148, %149 : i32
          %151 = spv.IMul %8, %cst3_i32 : i32
          %152 = spv.IAdd %150, %151 : i32
          %153 = spv.IMul %18, %cst3_i32 : i32
          %154 = spv.IAdd %152, %153 : i32
          %155 = spv.IAdd %154, %cst2_i32 : i32
          %156 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %155] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %157 = spv.Load "StorageBuffer" %156 : f32
          %158 = spv.CompositeInsert %157, %143[2 : i32] : f32 into vector<3xf32>
          %159 = spv.CompositeExtract %158[0 : i32] : vector<3xf32>
          %160 = spv.CompositeConstruct %159, %159, %159, %159 : vector<4xf32>
          %161 = spv.GLSL.Fma %160, %100, %88 : vector<4xf32>
          %162 = spv.CompositeExtract %158[1 : i32] : vector<3xf32>
          %163 = spv.CompositeConstruct %162, %162, %162, %162 : vector<4xf32>
          %164 = spv.GLSL.Fma %163, %107, %161 : vector<4xf32>
          %165 = spv.CompositeExtract %158[2 : i32] : vector<3xf32>
          %166 = spv.CompositeConstruct %165, %165, %165, %165 : vector<4xf32>
          %167 = spv.GLSL.Fma %166, %114, %164 : vector<4xf32>
          %168 = spv.IMul %72, %cst675_i32 : i32
          %169 = spv.IMul %21, %cst675_i32 : i32
          %170 = spv.IAdd %168, %169 : i32
          %171 = spv.IMul %17, %cst675_i32 : i32
          %172 = spv.IAdd %170, %171 : i32
          %173 = spv.IMul %8, %cst3_i32 : i32
          %174 = spv.IAdd %172, %173 : i32
          %175 = spv.IMul %18, %cst3_i32 : i32
          %176 = spv.IAdd %174, %175 : i32
          %177 = spv.IMul %87, %cst3_i32 : i32
          %178 = spv.IAdd %176, %177 : i32
          %179 = spv.IAdd %178, %cst6_i32 : i32
          %180 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %179] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %181 = spv.Load "StorageBuffer" %180 : f32
          %182 = spv.CompositeInsert %181, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %183 = spv.IMul %72, %cst675_i32 : i32
          %184 = spv.IMul %21, %cst675_i32 : i32
          %185 = spv.IAdd %183, %184 : i32
          %186 = spv.IMul %17, %cst675_i32 : i32
          %187 = spv.IAdd %185, %186 : i32
          %188 = spv.IMul %8, %cst3_i32 : i32
          %189 = spv.IAdd %187, %188 : i32
          %190 = spv.IMul %18, %cst3_i32 : i32
          %191 = spv.IAdd %189, %190 : i32
          %192 = spv.IMul %87, %cst3_i32 : i32
          %193 = spv.IAdd %191, %192 : i32
          %194 = spv.IAdd %193, %cst7_i32 : i32
          %195 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %194] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %196 = spv.Load "StorageBuffer" %195 : f32
          %197 = spv.CompositeInsert %196, %182[1 : i32] : f32 into vector<3xf32>
          %198 = spv.IMul %72, %cst675_i32 : i32
          %199 = spv.IMul %21, %cst675_i32 : i32
          %200 = spv.IAdd %198, %199 : i32
          %201 = spv.IMul %17, %cst675_i32 : i32
          %202 = spv.IAdd %200, %201 : i32
          %203 = spv.IMul %8, %cst3_i32 : i32
          %204 = spv.IAdd %202, %203 : i32
          %205 = spv.IMul %18, %cst3_i32 : i32
          %206 = spv.IAdd %204, %205 : i32
          %207 = spv.IMul %87, %cst3_i32 : i32
          %208 = spv.IAdd %206, %207 : i32
          %209 = spv.IAdd %208, %cst8_i32 : i32
          %210 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %209] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %211 = spv.Load "StorageBuffer" %210 : f32
          %212 = spv.CompositeInsert %211, %197[2 : i32] : f32 into vector<3xf32>
          %213 = spv.CompositeExtract %212[0 : i32] : vector<3xf32>
          %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
          %215 = spv.GLSL.Fma %214, %100, %89 : vector<4xf32>
          %216 = spv.CompositeExtract %212[1 : i32] : vector<3xf32>
          %217 = spv.CompositeConstruct %216, %216, %216, %216 : vector<4xf32>
          %218 = spv.GLSL.Fma %217, %107, %215 : vector<4xf32>
          %219 = spv.CompositeExtract %212[2 : i32] : vector<3xf32>
          %220 = spv.CompositeConstruct %219, %219, %219, %219 : vector<4xf32>
          %221 = spv.GLSL.Fma %220, %114, %218 : vector<4xf32>
          %222 = spv.IMul %72, %cst675_i32 : i32
          %223 = spv.IMul %21, %cst675_i32 : i32
          %224 = spv.IAdd %222, %223 : i32
          %225 = spv.IMul %17, %cst675_i32 : i32
          %226 = spv.IAdd %224, %225 : i32
          %227 = spv.IMul %8, %cst3_i32 : i32
          %228 = spv.IAdd %226, %227 : i32
          %229 = spv.IMul %18, %cst3_i32 : i32
          %230 = spv.IAdd %228, %229 : i32
          %231 = spv.IMul %87, %cst3_i32 : i32
          %232 = spv.IAdd %230, %231 : i32
          %233 = spv.IAdd %232, %cst12_i32 : i32
          %234 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %233] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %235 = spv.Load "StorageBuffer" %234 : f32
          %236 = spv.CompositeInsert %235, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %237 = spv.IMul %72, %cst675_i32 : i32
          %238 = spv.IMul %21, %cst675_i32 : i32
          %239 = spv.IAdd %237, %238 : i32
          %240 = spv.IMul %17, %cst675_i32 : i32
          %241 = spv.IAdd %239, %240 : i32
          %242 = spv.IMul %8, %cst3_i32 : i32
          %243 = spv.IAdd %241, %242 : i32
          %244 = spv.IMul %18, %cst3_i32 : i32
          %245 = spv.IAdd %243, %244 : i32
          %246 = spv.IMul %87, %cst3_i32 : i32
          %247 = spv.IAdd %245, %246 : i32
          %248 = spv.IAdd %247, %cst13_i32 : i32
          %249 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %248] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %250 = spv.Load "StorageBuffer" %249 : f32
          %251 = spv.CompositeInsert %250, %236[1 : i32] : f32 into vector<3xf32>
          %252 = spv.IMul %72, %cst675_i32 : i32
          %253 = spv.IMul %21, %cst675_i32 : i32
          %254 = spv.IAdd %252, %253 : i32
          %255 = spv.IMul %17, %cst675_i32 : i32
          %256 = spv.IAdd %254, %255 : i32
          %257 = spv.IMul %8, %cst3_i32 : i32
          %258 = spv.IAdd %256, %257 : i32
          %259 = spv.IMul %18, %cst3_i32 : i32
          %260 = spv.IAdd %258, %259 : i32
          %261 = spv.IMul %87, %cst3_i32 : i32
          %262 = spv.IAdd %260, %261 : i32
          %263 = spv.IAdd %262, %cst14_i32 : i32
          %264 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %263] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %265 = spv.Load "StorageBuffer" %264 : f32
          %266 = spv.CompositeInsert %265, %251[2 : i32] : f32 into vector<3xf32>
          %267 = spv.CompositeExtract %266[0 : i32] : vector<3xf32>
          %268 = spv.CompositeConstruct %267, %267, %267, %267 : vector<4xf32>
          %269 = spv.GLSL.Fma %268, %100, %90 : vector<4xf32>
          %270 = spv.CompositeExtract %266[1 : i32] : vector<3xf32>
          %271 = spv.CompositeConstruct %270, %270, %270, %270 : vector<4xf32>
          %272 = spv.GLSL.Fma %271, %107, %269 : vector<4xf32>
          %273 = spv.CompositeExtract %266[2 : i32] : vector<3xf32>
          %274 = spv.CompositeConstruct %273, %273, %273, %273 : vector<4xf32>
          %275 = spv.GLSL.Fma %274, %114, %272 : vector<4xf32>
          %276 = spv.IMul %72, %cst675_i32 : i32
          %277 = spv.IMul %21, %cst675_i32 : i32
          %278 = spv.IAdd %276, %277 : i32
          %279 = spv.IMul %17, %cst675_i32 : i32
          %280 = spv.IAdd %278, %279 : i32
          %281 = spv.IMul %8, %cst3_i32 : i32
          %282 = spv.IAdd %280, %281 : i32
          %283 = spv.IMul %18, %cst3_i32 : i32
          %284 = spv.IAdd %282, %283 : i32
          %285 = spv.IMul %87, %cst3_i32 : i32
          %286 = spv.IAdd %284, %285 : i32
          %287 = spv.IAdd %286, %cst18_i32 : i32
          %288 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %287] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %289 = spv.Load "StorageBuffer" %288 : f32
          %290 = spv.CompositeInsert %289, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %291 = spv.IMul %72, %cst675_i32 : i32
          %292 = spv.IMul %21, %cst675_i32 : i32
          %293 = spv.IAdd %291, %292 : i32
          %294 = spv.IMul %17, %cst675_i32 : i32
          %295 = spv.IAdd %293, %294 : i32
          %296 = spv.IMul %8, %cst3_i32 : i32
          %297 = spv.IAdd %295, %296 : i32
          %298 = spv.IMul %18, %cst3_i32 : i32
          %299 = spv.IAdd %297, %298 : i32
          %300 = spv.IMul %87, %cst3_i32 : i32
          %301 = spv.IAdd %299, %300 : i32
          %302 = spv.IAdd %301, %cst19_i32 : i32
          %303 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %302] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %304 = spv.Load "StorageBuffer" %303 : f32
          %305 = spv.CompositeInsert %304, %290[1 : i32] : f32 into vector<3xf32>
          %306 = spv.IMul %72, %cst675_i32 : i32
          %307 = spv.IMul %21, %cst675_i32 : i32
          %308 = spv.IAdd %306, %307 : i32
          %309 = spv.IMul %17, %cst675_i32 : i32
          %310 = spv.IAdd %308, %309 : i32
          %311 = spv.IMul %8, %cst3_i32 : i32
          %312 = spv.IAdd %310, %311 : i32
          %313 = spv.IMul %18, %cst3_i32 : i32
          %314 = spv.IAdd %312, %313 : i32
          %315 = spv.IMul %87, %cst3_i32 : i32
          %316 = spv.IAdd %314, %315 : i32
          %317 = spv.IAdd %316, %cst20_i32 : i32
          %318 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %317] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %319 = spv.Load "StorageBuffer" %318 : f32
          %320 = spv.CompositeInsert %319, %305[2 : i32] : f32 into vector<3xf32>
          %321 = spv.CompositeExtract %320[0 : i32] : vector<3xf32>
          %322 = spv.CompositeConstruct %321, %321, %321, %321 : vector<4xf32>
          %323 = spv.GLSL.Fma %322, %100, %91 : vector<4xf32>
          %324 = spv.CompositeExtract %320[1 : i32] : vector<3xf32>
          %325 = spv.CompositeConstruct %324, %324, %324, %324 : vector<4xf32>
          %326 = spv.GLSL.Fma %325, %107, %323 : vector<4xf32>
          %327 = spv.CompositeExtract %320[2 : i32] : vector<3xf32>
          %328 = spv.CompositeConstruct %327, %327, %327, %327 : vector<4xf32>
          %329 = spv.GLSL.Fma %328, %114, %326 : vector<4xf32>
          spv.Store "Function" %78, %167 : vector<4xf32>
          spv.Store "Function" %79, %221 : vector<4xf32>
          spv.Store "Function" %80, %275 : vector<4xf32>
          spv.Store "Function" %81, %329 : vector<4xf32>
          %330 = spv.IAdd %87, %cst1_i32 : i32
          spv.Branch ^bb1(%330, %167, %221, %275, %329 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        %82 = spv.Load "Function" %81 : vector<4xf32>
        %83 = spv.Load "Function" %80 : vector<4xf32>
        %84 = spv.Load "Function" %79 : vector<4xf32>
        %85 = spv.Load "Function" %78 : vector<4xf32>
        spv.Store "Function" %22, %85 : vector<4xf32>
        spv.Store "Function" %23, %84 : vector<4xf32>
        spv.Store "Function" %24, %83 : vector<4xf32>
        spv.Store "Function" %25, %82 : vector<4xf32>
        %86 = spv.IAdd %72, %cst1_i32 : i32
        spv.Branch ^bb1(%86, %85, %84, %83, %82 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      %26 = spv.Load "Function" %25 : vector<4xf32>
      %27 = spv.Load "Function" %24 : vector<4xf32>
      %28 = spv.Load "Function" %23 : vector<4xf32>
      %29 = spv.Load "Function" %22 : vector<4xf32>
      %30 = spv.IAdd %7, %16 : i32
      %31 = spv.SDiv %30, %cst4_i32 : i32
      %32 = spv.IMul %19, %cst896_i32 : i32
      %33 = spv.IAdd %31, %32 : i32
      %34 = spv.IMul %14, %cst896_i32 : i32
      %35 = spv.IAdd %33, %34 : i32
      %36 = spv.IMul %6, %cst8_i32 : i32
      %37 = spv.IAdd %35, %36 : i32
      %38 = spv.IMul %15, %cst8_i32 : i32
      %39 = spv.IAdd %37, %38 : i32
      %40 = spv.IAdd %39, %cst24_i32 : i32
      %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
      %42 = spv.IMul %19, %cst896_i32 : i32
      %43 = spv.IAdd %31, %42 : i32
      %44 = spv.IMul %14, %cst896_i32 : i32
      %45 = spv.IAdd %43, %44 : i32
      %46 = spv.IMul %6, %cst8_i32 : i32
      %47 = spv.IAdd %45, %46 : i32
      %48 = spv.IMul %15, %cst8_i32 : i32
      %49 = spv.IAdd %47, %48 : i32
      %50 = spv.IAdd %49, %cst16_i32 : i32
      %51 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %50] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %51, %27 : vector<4xf32>
      %52 = spv.IMul %19, %cst896_i32 : i32
      %53 = spv.IAdd %31, %52 : i32
      %54 = spv.IMul %14, %cst896_i32 : i32
      %55 = spv.IAdd %53, %54 : i32
      %56 = spv.IMul %6, %cst8_i32 : i32
      %57 = spv.IAdd %55, %56 : i32
      %58 = spv.IMul %15, %cst8_i32 : i32
      %59 = spv.IAdd %57, %58 : i32
      %60 = spv.IAdd %59, %cst8_i32 : i32
      %61 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %60] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %61, %28 : vector<4xf32>
      %62 = spv.IMul %19, %cst896_i32 : i32
      %63 = spv.IAdd %31, %62 : i32
      %64 = spv.IMul %14, %cst896_i32 : i32
      %65 = spv.IAdd %63, %64 : i32
      %66 = spv.IMul %6, %cst8_i32 : i32
      %67 = spv.IAdd %65, %66 : i32
      %68 = spv.IMul %15, %cst8_i32 : i32
      %69 = spv.IAdd %67, %68 : i32
      %70 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %69] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %70, %29 : vector<4xf32>
      %71 = spv.IAdd %19, %cst112_i32 : i32
      spv.Branch ^bb1(%71 : i32)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}

// -----// IR Dump After CSE //----- //
spv.module Logical GLSL450 {
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    %cst896_i32 = spv.Constant 896 : i32
    %cst20_i32 = spv.Constant 20 : i32
    %cst19_i32 = spv.Constant 19 : i32
    %cst18_i32 = spv.Constant 18 : i32
    %cst14_i32 = spv.Constant 14 : i32
    %cst13_i32 = spv.Constant 13 : i32
    %cst12_i32 = spv.Constant 12 : i32
    %cst7_i32 = spv.Constant 7 : i32
    %cst6_i32 = spv.Constant 6 : i32
    %cst675_i32 = spv.Constant 675 : i32
    %cst24_i32 = spv.Constant 24 : i32
    %cst72_i32 = spv.Constant 72 : i32
    %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
    %cst8_i32 = spv.Constant 8 : i32
    %cst2_i32 = spv.Constant 2 : i32
    %cst4_i32 = spv.Constant 4 : i32
    %cst16_i32 = spv.Constant 16 : i32
    %cst32_i32 = spv.Constant 32 : i32
    %cst0_i32 = spv.Constant 0 : i32
    %cst112_i32 = spv.Constant 112 : i32
    %cst3_i32 = spv.Constant 3 : i32
    %cst1_i32 = spv.Constant 1 : i32
    %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
    %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
    %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
    %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
    %6 = spv.IMul %3, %cst8_i32 : i32
    %7 = spv.IMul %1, %cst32_i32 : i32
    %8 = spv.IMul %3, %cst16_i32 : i32
    %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
    %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
    %15 = spv.IMul %12, %cst4_i32 : i32
    %16 = spv.IMul %10, %cst4_i32 : i32
    %17 = spv.IMul %14, %cst2_i32 : i32
    %18 = spv.IMul %12, %cst8_i32 : i32
    spv.mlir.loop {
      spv.Branch ^bb1(%5 : i32)
    ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
      %20 = spv.SLessThan %19, %cst112_i32 : i32
      spv.BranchConditional %20, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %21 = spv.IMul %19, %cst2_i32 : i32
      %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      spv.mlir.loop {
        spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %53 = spv.SLessThan %48, %cst3_i32 : i32
        spv.BranchConditional %53, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        spv.mlir.loop {
          spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %68 = spv.SLessThan %63, %cst3_i32 : i32
          spv.BranchConditional %68, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %69 = spv.IAdd %7, %16 : i32
          %70 = spv.SDiv %69, %cst4_i32 : i32
          %71 = spv.IMul %48, %cst72_i32 : i32
          %72 = spv.IMul %63, %cst24_i32 : i32
          %73 = spv.IAdd %71, %72 : i32
          %74 = spv.IAdd %73, %70 : i32
          %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
          %77 = spv.IAdd %74, %cst8_i32 : i32
          %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
          %80 = spv.IAdd %74, %cst16_i32 : i32
          %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
          %83 = spv.IMul %48, %cst675_i32 : i32
          %84 = spv.IMul %63, %cst3_i32 : i32
          %85 = spv.IAdd %83, %84 : i32
          %86 = spv.IMul %21, %cst675_i32 : i32
          %87 = spv.IAdd %85, %86 : i32
          %88 = spv.IMul %17, %cst675_i32 : i32
          %89 = spv.IAdd %87, %88 : i32
          %90 = spv.IMul %8, %cst3_i32 : i32
          %91 = spv.IAdd %89, %90 : i32
          %92 = spv.IMul %18, %cst3_i32 : i32
          %93 = spv.IAdd %91, %92 : i32
          %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %95 = spv.Load "StorageBuffer" %94 : f32
          %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %97 = spv.IAdd %93, %cst1_i32 : i32
          %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %99 = spv.Load "StorageBuffer" %98 : f32
          %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
          %101 = spv.IAdd %93, %cst2_i32 : i32
          %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %103 = spv.Load "StorageBuffer" %102 : f32
          %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
          %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
          %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
          %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
          %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
          %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
          %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
          %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
          %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
          %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
          %114 = spv.IAdd %83, %86 : i32
          %115 = spv.IAdd %114, %88 : i32
          %116 = spv.IAdd %115, %90 : i32
          %117 = spv.IAdd %116, %92 : i32
          %118 = spv.IAdd %117, %84 : i32
          %119 = spv.IAdd %118, %cst6_i32 : i32
          %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %121 = spv.Load "StorageBuffer" %120 : f32
          %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %123 = spv.IAdd %118, %cst7_i32 : i32
          %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %125 = spv.Load "StorageBuffer" %124 : f32
          %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
          %127 = spv.IAdd %118, %cst8_i32 : i32
          %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %129 = spv.Load "StorageBuffer" %128 : f32
          %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
          %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
          %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
          %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
          %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
          %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
          %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
          %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
          %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
          %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
          %140 = spv.IAdd %118, %cst12_i32 : i32
          %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %142 = spv.Load "StorageBuffer" %141 : f32
          %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %144 = spv.IAdd %118, %cst13_i32 : i32
          %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %146 = spv.Load "StorageBuffer" %145 : f32
          %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
          %148 = spv.IAdd %118, %cst14_i32 : i32
          %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %150 = spv.Load "StorageBuffer" %149 : f32
          %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
          %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
          %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
          %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
          %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
          %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
          %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
          %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
          %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
          %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
          %161 = spv.IAdd %118, %cst18_i32 : i32
          %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %163 = spv.Load "StorageBuffer" %162 : f32
          %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %165 = spv.IAdd %118, %cst19_i32 : i32
          %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %167 = spv.Load "StorageBuffer" %166 : f32
          %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
          %169 = spv.IAdd %118, %cst20_i32 : i32
          %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %171 = spv.Load "StorageBuffer" %170 : f32
          %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
          %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
          %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
          %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
          %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
          %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
          %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
          %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
          %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
          %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
          spv.Store "Function" %54, %113 : vector<4xf32>
          spv.Store "Function" %55, %139 : vector<4xf32>
          spv.Store "Function" %56, %160 : vector<4xf32>
          spv.Store "Function" %57, %181 : vector<4xf32>
          %182 = spv.IAdd %63, %cst1_i32 : i32
          spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        %58 = spv.Load "Function" %57 : vector<4xf32>
        %59 = spv.Load "Function" %56 : vector<4xf32>
        %60 = spv.Load "Function" %55 : vector<4xf32>
        %61 = spv.Load "Function" %54 : vector<4xf32>
        spv.Store "Function" %22, %61 : vector<4xf32>
        spv.Store "Function" %23, %60 : vector<4xf32>
        spv.Store "Function" %24, %59 : vector<4xf32>
        spv.Store "Function" %25, %58 : vector<4xf32>
        %62 = spv.IAdd %48, %cst1_i32 : i32
        spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      %26 = spv.Load "Function" %25 : vector<4xf32>
      %27 = spv.Load "Function" %24 : vector<4xf32>
      %28 = spv.Load "Function" %23 : vector<4xf32>
      %29 = spv.Load "Function" %22 : vector<4xf32>
      %30 = spv.IAdd %7, %16 : i32
      %31 = spv.SDiv %30, %cst4_i32 : i32
      %32 = spv.IMul %19, %cst896_i32 : i32
      %33 = spv.IAdd %31, %32 : i32
      %34 = spv.IMul %14, %cst896_i32 : i32
      %35 = spv.IAdd %33, %34 : i32
      %36 = spv.IMul %6, %cst8_i32 : i32
      %37 = spv.IAdd %35, %36 : i32
      %38 = spv.IMul %15, %cst8_i32 : i32
      %39 = spv.IAdd %37, %38 : i32
      %40 = spv.IAdd %39, %cst24_i32 : i32
      %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
      %42 = spv.IAdd %39, %cst16_i32 : i32
      %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
      %44 = spv.IAdd %39, %cst8_i32 : i32
      %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
      %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
      %47 = spv.IAdd %19, %cst112_i32 : i32
      spv.Branch ^bb1(%47 : i32)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}

// -----// IR Dump After SPIRVUpdateVCE //----- //
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    %cst896_i32 = spv.Constant 896 : i32
    %cst20_i32 = spv.Constant 20 : i32
    %cst19_i32 = spv.Constant 19 : i32
    %cst18_i32 = spv.Constant 18 : i32
    %cst14_i32 = spv.Constant 14 : i32
    %cst13_i32 = spv.Constant 13 : i32
    %cst12_i32 = spv.Constant 12 : i32
    %cst7_i32 = spv.Constant 7 : i32
    %cst6_i32 = spv.Constant 6 : i32
    %cst675_i32 = spv.Constant 675 : i32
    %cst24_i32 = spv.Constant 24 : i32
    %cst72_i32 = spv.Constant 72 : i32
    %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
    %cst8_i32 = spv.Constant 8 : i32
    %cst2_i32 = spv.Constant 2 : i32
    %cst4_i32 = spv.Constant 4 : i32
    %cst16_i32 = spv.Constant 16 : i32
    %cst32_i32 = spv.Constant 32 : i32
    %cst0_i32 = spv.Constant 0 : i32
    %cst112_i32 = spv.Constant 112 : i32
    %cst3_i32 = spv.Constant 3 : i32
    %cst1_i32 = spv.Constant 1 : i32
    %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
    %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
    %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
    %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
    %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
    %6 = spv.IMul %3, %cst8_i32 : i32
    %7 = spv.IMul %1, %cst32_i32 : i32
    %8 = spv.IMul %3, %cst16_i32 : i32
    %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
    %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
    %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
    %15 = spv.IMul %12, %cst4_i32 : i32
    %16 = spv.IMul %10, %cst4_i32 : i32
    %17 = spv.IMul %14, %cst2_i32 : i32
    %18 = spv.IMul %12, %cst8_i32 : i32
    spv.mlir.loop {
      spv.Branch ^bb1(%5 : i32)
    ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
      %20 = spv.SLessThan %19, %cst112_i32 : i32
      spv.BranchConditional %20, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %21 = spv.IMul %19, %cst2_i32 : i32
      %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      spv.mlir.loop {
        spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %53 = spv.SLessThan %48, %cst3_i32 : i32
        spv.BranchConditional %53, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        spv.mlir.loop {
          spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %68 = spv.SLessThan %63, %cst3_i32 : i32
          spv.BranchConditional %68, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %69 = spv.IAdd %7, %16 : i32
          %70 = spv.SDiv %69, %cst4_i32 : i32
          %71 = spv.IMul %48, %cst72_i32 : i32
          %72 = spv.IMul %63, %cst24_i32 : i32
          %73 = spv.IAdd %71, %72 : i32
          %74 = spv.IAdd %73, %70 : i32
          %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
          %77 = spv.IAdd %74, %cst8_i32 : i32
          %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
          %80 = spv.IAdd %74, %cst16_i32 : i32
          %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
          %83 = spv.IMul %48, %cst675_i32 : i32
          %84 = spv.IMul %63, %cst3_i32 : i32
          %85 = spv.IAdd %83, %84 : i32
          %86 = spv.IMul %21, %cst675_i32 : i32
          %87 = spv.IAdd %85, %86 : i32
          %88 = spv.IMul %17, %cst675_i32 : i32
          %89 = spv.IAdd %87, %88 : i32
          %90 = spv.IMul %8, %cst3_i32 : i32
          %91 = spv.IAdd %89, %90 : i32
          %92 = spv.IMul %18, %cst3_i32 : i32
          %93 = spv.IAdd %91, %92 : i32
          %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %95 = spv.Load "StorageBuffer" %94 : f32
          %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %97 = spv.IAdd %93, %cst1_i32 : i32
          %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %99 = spv.Load "StorageBuffer" %98 : f32
          %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
          %101 = spv.IAdd %93, %cst2_i32 : i32
          %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %103 = spv.Load "StorageBuffer" %102 : f32
          %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
          %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
          %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
          %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
          %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
          %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
          %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
          %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
          %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
          %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
          %114 = spv.IAdd %83, %86 : i32
          %115 = spv.IAdd %114, %88 : i32
          %116 = spv.IAdd %115, %90 : i32
          %117 = spv.IAdd %116, %92 : i32
          %118 = spv.IAdd %117, %84 : i32
          %119 = spv.IAdd %118, %cst6_i32 : i32
          %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %121 = spv.Load "StorageBuffer" %120 : f32
          %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %123 = spv.IAdd %118, %cst7_i32 : i32
          %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %125 = spv.Load "StorageBuffer" %124 : f32
          %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
          %127 = spv.IAdd %118, %cst8_i32 : i32
          %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %129 = spv.Load "StorageBuffer" %128 : f32
          %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
          %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
          %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
          %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
          %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
          %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
          %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
          %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
          %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
          %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
          %140 = spv.IAdd %118, %cst12_i32 : i32
          %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %142 = spv.Load "StorageBuffer" %141 : f32
          %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %144 = spv.IAdd %118, %cst13_i32 : i32
          %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %146 = spv.Load "StorageBuffer" %145 : f32
          %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
          %148 = spv.IAdd %118, %cst14_i32 : i32
          %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %150 = spv.Load "StorageBuffer" %149 : f32
          %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
          %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
          %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
          %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
          %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
          %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
          %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
          %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
          %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
          %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
          %161 = spv.IAdd %118, %cst18_i32 : i32
          %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %163 = spv.Load "StorageBuffer" %162 : f32
          %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
          %165 = spv.IAdd %118, %cst19_i32 : i32
          %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %167 = spv.Load "StorageBuffer" %166 : f32
          %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
          %169 = spv.IAdd %118, %cst20_i32 : i32
          %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %171 = spv.Load "StorageBuffer" %170 : f32
          %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
          %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
          %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
          %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
          %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
          %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
          %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
          %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
          %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
          %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
          spv.Store "Function" %54, %113 : vector<4xf32>
          spv.Store "Function" %55, %139 : vector<4xf32>
          spv.Store "Function" %56, %160 : vector<4xf32>
          spv.Store "Function" %57, %181 : vector<4xf32>
          %182 = spv.IAdd %63, %cst1_i32 : i32
          spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        %58 = spv.Load "Function" %57 : vector<4xf32>
        %59 = spv.Load "Function" %56 : vector<4xf32>
        %60 = spv.Load "Function" %55 : vector<4xf32>
        %61 = spv.Load "Function" %54 : vector<4xf32>
        spv.Store "Function" %22, %61 : vector<4xf32>
        spv.Store "Function" %23, %60 : vector<4xf32>
        spv.Store "Function" %24, %59 : vector<4xf32>
        spv.Store "Function" %25, %58 : vector<4xf32>
        %62 = spv.IAdd %48, %cst1_i32 : i32
        spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      %26 = spv.Load "Function" %25 : vector<4xf32>
      %27 = spv.Load "Function" %24 : vector<4xf32>
      %28 = spv.Load "Function" %23 : vector<4xf32>
      %29 = spv.Load "Function" %22 : vector<4xf32>
      %30 = spv.IAdd %7, %16 : i32
      %31 = spv.SDiv %30, %cst4_i32 : i32
      %32 = spv.IMul %19, %cst896_i32 : i32
      %33 = spv.IAdd %31, %32 : i32
      %34 = spv.IMul %14, %cst896_i32 : i32
      %35 = spv.IAdd %33, %34 : i32
      %36 = spv.IMul %6, %cst8_i32 : i32
      %37 = spv.IAdd %35, %36 : i32
      %38 = spv.IMul %15, %cst8_i32 : i32
      %39 = spv.IAdd %37, %38 : i32
      %40 = spv.IAdd %39, %cst24_i32 : i32
      %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
      %42 = spv.IAdd %39, %cst16_i32 : i32
      %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
      %44 = spv.IAdd %39, %cst8_i32 : i32
      %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
      %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
      %47 = spv.IAdd %19, %cst112_i32 : i32
      spv.Branch ^bb1(%47 : i32)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
  hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c1 = constant 1 : index
    %c14 = constant 14 : index
    %c112 = constant 112 : index
    hal.return %c1, %c14, %c112 : index, index, index
  }
  builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
    spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
      spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
      spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
      spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
      spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
      spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
      spv.func @conv_dispatch_0() "None" {
        %cst896_i32 = spv.Constant 896 : i32
        %cst20_i32 = spv.Constant 20 : i32
        %cst19_i32 = spv.Constant 19 : i32
        %cst18_i32 = spv.Constant 18 : i32
        %cst14_i32 = spv.Constant 14 : i32
        %cst13_i32 = spv.Constant 13 : i32
        %cst12_i32 = spv.Constant 12 : i32
        %cst7_i32 = spv.Constant 7 : i32
        %cst6_i32 = spv.Constant 6 : i32
        %cst675_i32 = spv.Constant 675 : i32
        %cst24_i32 = spv.Constant 24 : i32
        %cst72_i32 = spv.Constant 72 : i32
        %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
        %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
        %cst8_i32 = spv.Constant 8 : i32
        %cst2_i32 = spv.Constant 2 : i32
        %cst4_i32 = spv.Constant 4 : i32
        %cst16_i32 = spv.Constant 16 : i32
        %cst32_i32 = spv.Constant 32 : i32
        %cst0_i32 = spv.Constant 0 : i32
        %cst112_i32 = spv.Constant 112 : i32
        %cst3_i32 = spv.Constant 3 : i32
        %cst1_i32 = spv.Constant 1 : i32
        %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
        %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
        %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
        %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
        %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
        %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
        %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
        %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
        %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
        %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
        %6 = spv.IMul %3, %cst8_i32 : i32
        %7 = spv.IMul %1, %cst32_i32 : i32
        %8 = spv.IMul %3, %cst16_i32 : i32
        %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
        %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
        %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
        %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
        %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
        %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
        %15 = spv.IMul %12, %cst4_i32 : i32
        %16 = spv.IMul %10, %cst4_i32 : i32
        %17 = spv.IMul %14, %cst2_i32 : i32
        %18 = spv.IMul %12, %cst8_i32 : i32
        spv.mlir.loop {
          spv.Branch ^bb1(%5 : i32)
        ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
          %20 = spv.SLessThan %19, %cst112_i32 : i32
          spv.BranchConditional %20, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %21 = spv.IMul %19, %cst2_i32 : i32
          %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          spv.mlir.loop {
            spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
          ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
            %53 = spv.SLessThan %48, %cst3_i32 : i32
            spv.BranchConditional %53, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            spv.mlir.loop {
              spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
              %68 = spv.SLessThan %63, %cst3_i32 : i32
              spv.BranchConditional %68, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %69 = spv.IAdd %7, %16 : i32
              %70 = spv.SDiv %69, %cst4_i32 : i32
              %71 = spv.IMul %48, %cst72_i32 : i32
              %72 = spv.IMul %63, %cst24_i32 : i32
              %73 = spv.IAdd %71, %72 : i32
              %74 = spv.IAdd %73, %70 : i32
              %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
              %77 = spv.IAdd %74, %cst8_i32 : i32
              %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
              %80 = spv.IAdd %74, %cst16_i32 : i32
              %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
              %83 = spv.IMul %48, %cst675_i32 : i32
              %84 = spv.IMul %63, %cst3_i32 : i32
              %85 = spv.IAdd %83, %84 : i32
              %86 = spv.IMul %21, %cst675_i32 : i32
              %87 = spv.IAdd %85, %86 : i32
              %88 = spv.IMul %17, %cst675_i32 : i32
              %89 = spv.IAdd %87, %88 : i32
              %90 = spv.IMul %8, %cst3_i32 : i32
              %91 = spv.IAdd %89, %90 : i32
              %92 = spv.IMul %18, %cst3_i32 : i32
              %93 = spv.IAdd %91, %92 : i32
              %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %95 = spv.Load "StorageBuffer" %94 : f32
              %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
              %97 = spv.IAdd %93, %cst1_i32 : i32
              %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %99 = spv.Load "StorageBuffer" %98 : f32
              %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
              %101 = spv.IAdd %93, %cst2_i32 : i32
              %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %103 = spv.Load "StorageBuffer" %102 : f32
              %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
              %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
              %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
              %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
              %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
              %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
              %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
              %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
              %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
              %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
              %114 = spv.IAdd %83, %86 : i32
              %115 = spv.IAdd %114, %88 : i32
              %116 = spv.IAdd %115, %90 : i32
              %117 = spv.IAdd %116, %92 : i32
              %118 = spv.IAdd %117, %84 : i32
              %119 = spv.IAdd %118, %cst6_i32 : i32
              %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %121 = spv.Load "StorageBuffer" %120 : f32
              %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
              %123 = spv.IAdd %118, %cst7_i32 : i32
              %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %125 = spv.Load "StorageBuffer" %124 : f32
              %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
              %127 = spv.IAdd %118, %cst8_i32 : i32
              %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %129 = spv.Load "StorageBuffer" %128 : f32
              %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
              %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
              %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
              %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
              %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
              %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
              %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
              %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
              %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
              %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
              %140 = spv.IAdd %118, %cst12_i32 : i32
              %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %142 = spv.Load "StorageBuffer" %141 : f32
              %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
              %144 = spv.IAdd %118, %cst13_i32 : i32
              %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %146 = spv.Load "StorageBuffer" %145 : f32
              %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
              %148 = spv.IAdd %118, %cst14_i32 : i32
              %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %150 = spv.Load "StorageBuffer" %149 : f32
              %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
              %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
              %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
              %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
              %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
              %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
              %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
              %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
              %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
              %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
              %161 = spv.IAdd %118, %cst18_i32 : i32
              %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %163 = spv.Load "StorageBuffer" %162 : f32
              %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
              %165 = spv.IAdd %118, %cst19_i32 : i32
              %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %167 = spv.Load "StorageBuffer" %166 : f32
              %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
              %169 = spv.IAdd %118, %cst20_i32 : i32
              %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
              %171 = spv.Load "StorageBuffer" %170 : f32
              %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
              %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
              %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
              %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
              %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
              %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
              %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
              %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
              %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
              %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
              spv.Store "Function" %54, %113 : vector<4xf32>
              spv.Store "Function" %55, %139 : vector<4xf32>
              spv.Store "Function" %56, %160 : vector<4xf32>
              spv.Store "Function" %57, %181 : vector<4xf32>
              %182 = spv.IAdd %63, %cst1_i32 : i32
              spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            %58 = spv.Load "Function" %57 : vector<4xf32>
            %59 = spv.Load "Function" %56 : vector<4xf32>
            %60 = spv.Load "Function" %55 : vector<4xf32>
            %61 = spv.Load "Function" %54 : vector<4xf32>
            spv.Store "Function" %22, %61 : vector<4xf32>
            spv.Store "Function" %23, %60 : vector<4xf32>
            spv.Store "Function" %24, %59 : vector<4xf32>
            spv.Store "Function" %25, %58 : vector<4xf32>
            %62 = spv.IAdd %48, %cst1_i32 : i32
            spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
          ^bb3:  // pred: ^bb1
            spv.mlir.merge
          }
          %26 = spv.Load "Function" %25 : vector<4xf32>
          %27 = spv.Load "Function" %24 : vector<4xf32>
          %28 = spv.Load "Function" %23 : vector<4xf32>
          %29 = spv.Load "Function" %22 : vector<4xf32>
          %30 = spv.IAdd %7, %16 : i32
          %31 = spv.SDiv %30, %cst4_i32 : i32
          %32 = spv.IMul %19, %cst896_i32 : i32
          %33 = spv.IAdd %31, %32 : i32
          %34 = spv.IMul %14, %cst896_i32 : i32
          %35 = spv.IAdd %33, %34 : i32
          %36 = spv.IMul %6, %cst8_i32 : i32
          %37 = spv.IAdd %35, %36 : i32
          %38 = spv.IMul %15, %cst8_i32 : i32
          %39 = spv.IAdd %37, %38 : i32
          %40 = spv.IAdd %39, %cst24_i32 : i32
          %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
          %42 = spv.IAdd %39, %cst16_i32 : i32
          %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
          %44 = spv.IAdd %39, %cst8_i32 : i32
          %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
          %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
          %47 = spv.IAdd %19, %cst112_i32 : i32
          spv.Branch ^bb1(%47 : i32)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        spv.Return
      }
      spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
      spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
    }
    hal.interface private @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- //
hal.executable private @conv_dispatch_0 {
  hal.interface public @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
    hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
    ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
      %c1 = constant 1 : index
      %c14 = constant 14 : index
      %c112 = constant 112 : index
      hal.return %c1, %c14, %c112 : index, index, index
    }
    builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
        spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
        spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
        spv.func @conv_dispatch_0() "None" {
          %cst896_i32 = spv.Constant 896 : i32
          %cst20_i32 = spv.Constant 20 : i32
          %cst19_i32 = spv.Constant 19 : i32
          %cst18_i32 = spv.Constant 18 : i32
          %cst14_i32 = spv.Constant 14 : i32
          %cst13_i32 = spv.Constant 13 : i32
          %cst12_i32 = spv.Constant 12 : i32
          %cst7_i32 = spv.Constant 7 : i32
          %cst6_i32 = spv.Constant 6 : i32
          %cst675_i32 = spv.Constant 675 : i32
          %cst24_i32 = spv.Constant 24 : i32
          %cst72_i32 = spv.Constant 72 : i32
          %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
          %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
          %cst8_i32 = spv.Constant 8 : i32
          %cst2_i32 = spv.Constant 2 : i32
          %cst4_i32 = spv.Constant 4 : i32
          %cst16_i32 = spv.Constant 16 : i32
          %cst32_i32 = spv.Constant 32 : i32
          %cst0_i32 = spv.Constant 0 : i32
          %cst112_i32 = spv.Constant 112 : i32
          %cst3_i32 = spv.Constant 3 : i32
          %cst1_i32 = spv.Constant 1 : i32
          %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
          %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
          %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
          %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
          %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
          %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
          %6 = spv.IMul %3, %cst8_i32 : i32
          %7 = spv.IMul %1, %cst32_i32 : i32
          %8 = spv.IMul %3, %cst16_i32 : i32
          %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
          %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
          %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
          %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
          %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
          %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
          %15 = spv.IMul %12, %cst4_i32 : i32
          %16 = spv.IMul %10, %cst4_i32 : i32
          %17 = spv.IMul %14, %cst2_i32 : i32
          %18 = spv.IMul %12, %cst8_i32 : i32
          spv.mlir.loop {
            spv.Branch ^bb1(%5 : i32)
          ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
            %20 = spv.SLessThan %19, %cst112_i32 : i32
            spv.BranchConditional %20, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            %21 = spv.IMul %19, %cst2_i32 : i32
            %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            spv.mlir.loop {
              spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
              %53 = spv.SLessThan %48, %cst3_i32 : i32
              spv.BranchConditional %53, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %68 = spv.SLessThan %63, %cst3_i32 : i32
                spv.BranchConditional %68, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %69 = spv.IAdd %7, %16 : i32
                %70 = spv.SDiv %69, %cst4_i32 : i32
                %71 = spv.IMul %48, %cst72_i32 : i32
                %72 = spv.IMul %63, %cst24_i32 : i32
                %73 = spv.IAdd %71, %72 : i32
                %74 = spv.IAdd %73, %70 : i32
                %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                %77 = spv.IAdd %74, %cst8_i32 : i32
                %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                %80 = spv.IAdd %74, %cst16_i32 : i32
                %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                %83 = spv.IMul %48, %cst675_i32 : i32
                %84 = spv.IMul %63, %cst3_i32 : i32
                %85 = spv.IAdd %83, %84 : i32
                %86 = spv.IMul %21, %cst675_i32 : i32
                %87 = spv.IAdd %85, %86 : i32
                %88 = spv.IMul %17, %cst675_i32 : i32
                %89 = spv.IAdd %87, %88 : i32
                %90 = spv.IMul %8, %cst3_i32 : i32
                %91 = spv.IAdd %89, %90 : i32
                %92 = spv.IMul %18, %cst3_i32 : i32
                %93 = spv.IAdd %91, %92 : i32
                %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %95 = spv.Load "StorageBuffer" %94 : f32
                %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                %97 = spv.IAdd %93, %cst1_i32 : i32
                %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %99 = spv.Load "StorageBuffer" %98 : f32
                %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                %101 = spv.IAdd %93, %cst2_i32 : i32
                %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %103 = spv.Load "StorageBuffer" %102 : f32
                %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                %114 = spv.IAdd %83, %86 : i32
                %115 = spv.IAdd %114, %88 : i32
                %116 = spv.IAdd %115, %90 : i32
                %117 = spv.IAdd %116, %92 : i32
                %118 = spv.IAdd %117, %84 : i32
                %119 = spv.IAdd %118, %cst6_i32 : i32
                %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %121 = spv.Load "StorageBuffer" %120 : f32
                %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                %123 = spv.IAdd %118, %cst7_i32 : i32
                %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %125 = spv.Load "StorageBuffer" %124 : f32
                %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                %127 = spv.IAdd %118, %cst8_i32 : i32
                %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %129 = spv.Load "StorageBuffer" %128 : f32
                %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                %140 = spv.IAdd %118, %cst12_i32 : i32
                %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %142 = spv.Load "StorageBuffer" %141 : f32
                %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                %144 = spv.IAdd %118, %cst13_i32 : i32
                %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %146 = spv.Load "StorageBuffer" %145 : f32
                %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                %148 = spv.IAdd %118, %cst14_i32 : i32
                %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %150 = spv.Load "StorageBuffer" %149 : f32
                %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                %161 = spv.IAdd %118, %cst18_i32 : i32
                %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %163 = spv.Load "StorageBuffer" %162 : f32
                %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                %165 = spv.IAdd %118, %cst19_i32 : i32
                %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %167 = spv.Load "StorageBuffer" %166 : f32
                %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                %169 = spv.IAdd %118, %cst20_i32 : i32
                %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %171 = spv.Load "StorageBuffer" %170 : f32
                %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                spv.Store "Function" %54, %113 : vector<4xf32>
                spv.Store "Function" %55, %139 : vector<4xf32>
                spv.Store "Function" %56, %160 : vector<4xf32>
                spv.Store "Function" %57, %181 : vector<4xf32>
                %182 = spv.IAdd %63, %cst1_i32 : i32
                spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %58 = spv.Load "Function" %57 : vector<4xf32>
              %59 = spv.Load "Function" %56 : vector<4xf32>
              %60 = spv.Load "Function" %55 : vector<4xf32>
              %61 = spv.Load "Function" %54 : vector<4xf32>
              spv.Store "Function" %22, %61 : vector<4xf32>
              spv.Store "Function" %23, %60 : vector<4xf32>
              spv.Store "Function" %24, %59 : vector<4xf32>
              spv.Store "Function" %25, %58 : vector<4xf32>
              %62 = spv.IAdd %48, %cst1_i32 : i32
              spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            %26 = spv.Load "Function" %25 : vector<4xf32>
            %27 = spv.Load "Function" %24 : vector<4xf32>
            %28 = spv.Load "Function" %23 : vector<4xf32>
            %29 = spv.Load "Function" %22 : vector<4xf32>
            %30 = spv.IAdd %7, %16 : i32
            %31 = spv.SDiv %30, %cst4_i32 : i32
            %32 = spv.IMul %19, %cst896_i32 : i32
            %33 = spv.IAdd %31, %32 : i32
            %34 = spv.IMul %14, %cst896_i32 : i32
            %35 = spv.IAdd %33, %34 : i32
            %36 = spv.IMul %6, %cst8_i32 : i32
            %37 = spv.IAdd %35, %36 : i32
            %38 = spv.IMul %15, %cst8_i32 : i32
            %39 = spv.IAdd %37, %38 : i32
            %40 = spv.IAdd %39, %cst24_i32 : i32
            %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
            %42 = spv.IAdd %39, %cst16_i32 : i32
            %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
            %44 = spv.IAdd %39, %cst8_i32 : i32
            %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
            %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
            %47 = spv.IAdd %19, %cst112_i32 : i32
            spv.Branch ^bb1(%47 : i32)
          ^bb3:  // pred: ^bb1
            spv.mlir.merge
          }
          spv.Return
        }
        spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
      }
      hal.interface private @io {
        hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
        hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %c50331680_i32 = constant 50331680 : i32
    %c1_i32 = constant 1 : i32
    %c1 = constant 1 : index
    %c225 = constant 225 : index
    %c3 = constant 3 : index
    %sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c225, %c225, %c3]) type(%c50331680_i32) encoding(%c1_i32) : index
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %c32_1 = constant 32 : index
    %sz_2 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c3, %c3, %c3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
    %c112_3 = constant 112 : index
    %sz_4 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112_3, %c112_3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
    %buffer_5 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz_4}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %c112_6 = constant 112 : index
    %c32_7 = constant 32 : index
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
        %c0 = (%buffer : !hal.buffer)[%c0, %sz],
        %c1 = (%buffer_0 : !hal.buffer)[%c0, %sz_2],
        %c2 = (%buffer_5 : !hal.buffer)[%c0, %sz_4]
      ])
      %c1_14 = constant 1 : index
      %c14 = constant 14 : index
      %c112_15 = constant 112 : index
      hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1_14, %c14, %c112_15])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %c1_8 = constant 1 : index
    %c112_9 = constant 112 : index
    %c112_10 = constant 112 : index
    %c32_11 = constant 32 : index
    %c50331680_i32_12 = constant 50331680 : i32
    %c1_i32_13 = constant 1 : i32
    %view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c1_8, %c112_9, %c112_10, %c32_11]) type(%c50331680_i32_12) encoding(%c1_i32_13) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionRankedShapeDimsPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %c50331680_i32 = constant 50331680 : i32
    %c1_i32 = constant 1 : i32
    %c1 = constant 1 : index
    %c225 = constant 225 : index
    %c3 = constant 3 : index
    %sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c225, %c225, %c3]) type(%c50331680_i32) encoding(%c1_i32) : index
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %c32_1 = constant 32 : index
    %sz_2 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c3, %c3, %c3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
    %c112_3 = constant 112 : index
    %sz_4 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112_3, %c112_3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
    %buffer_5 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz_4}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %c112_6 = constant 112 : index
    %c32_7 = constant 32 : index
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
        %c0 = (%buffer : !hal.buffer)[%c0, %sz],
        %c1 = (%buffer_0 : !hal.buffer)[%c0, %sz_2],
        %c2 = (%buffer_5 : !hal.buffer)[%c0, %sz_4]
      ])
      %c1_14 = constant 1 : index
      %c14 = constant 14 : index
      %c112_15 = constant 112 : index
      hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1_14, %c14, %c112_15])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %c1_8 = constant 1 : index
    %c112_9 = constant 112 : index
    %c112_10 = constant 112 : index
    %c32_11 = constant 32 : index
    %c50331680_i32_12 = constant 50331680 : i32
    %c1_i32_13 = constant 1 : i32
    %view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c1_8, %c112_9, %c112_10, %c32_11]) type(%c50331680_i32_12) encoding(%c1_i32_13) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c1_i32 = constant 1 : i32
  %c50331680_i32 = constant 50331680 : i32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c14 = constant 14 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1605632 = constant 1605632 : index
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
    hal.return
  }
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c1_i32 = constant 1 : i32
  %c50331680_i32 = constant 50331680 : i32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c14 = constant 14 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1605632 = constant 1605632 : index
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
    hal.return
  }
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PackAllocationsPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c1_i32 = constant 1 : i32
  %c50331680_i32 = constant 50331680 : i32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c14 = constant 14 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1605632 = constant 1605632 : index
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
    hal.return
  }
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkTargetExecutablesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c1_i32 = constant 1 : i32
    %c50331680_i32 = constant 50331680 : i32
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %c1 = constant 1 : index
    %c14 = constant 14 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1605632 = constant 1605632 : index
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
        %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
        %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
        %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
      ])
      hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkExecutablesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c1_i32 = constant 1 : i32
    %c50331680_i32 = constant 50331680 : i32
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %c1 = constant 1 : index
    %c14 = constant 14 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1605632 = constant 1605632 : index
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
        %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
        %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
        %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
      ])
      hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveEntryPointOrdinalsPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c1_i32 = constant 1 : i32
    %c50331680_i32 = constant 50331680 : i32
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %c1 = constant 1 : index
    %c14 = constant 14 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1605632 = constant 1605632 : index
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
        %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
        %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
        %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
      ])
      %0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
      %exe = hal.executable.lookup device(%0 : !hal.device) executable(@conv_dispatch_0) : !hal.executable
      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c14, %c112])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c607500 = constant 607500 : index
  %c3456 = constant 3456 : index
  %c1605632 = constant 1605632 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c14 = constant 14 : index
  %c1 = constant 1 : index
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %c50331680_i32 = constant 50331680 : i32
  %c1_i32 = constant 1 : i32
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    %exe = hal.executable.lookup device(%device : !hal.device) executable(@conv_dispatch_0) : !hal.executable
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c14, %c112])
    hal.return
  }
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c607500 = constant 607500 : index
  %c3456 = constant 3456 : index
  %c1605632 = constant 1605632 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c14 = constant 14 : index
  %c1 = constant 1 : index
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %c50331680_i32 = constant 50331680 : i32
  %c1_i32 = constant 1 : i32
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    %exe = hal.executable.lookup device(%device : !hal.device) executable(@conv_dispatch_0) : !hal.executable
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c14, %c112])
    hal.return
  }
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
    util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    util.initializer.return
  }
  util.global private @_executable_layout_0 : !hal.executable_layout
  util.initializer {
    %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
    util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
    util.initializer.return
  }
  util.global private @_executable_conv_dispatch_0 : !hal.executable
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %0 = hal.device.switch<%device : !hal.device> -> !hal.executable
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
      %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
      hal.return %exe : !hal.executable
    },
    #hal.match.always {
      %1 = util.null : !hal.executable
      hal.return %1 : !hal.executable
    }
    util.global.store %0, @_executable_conv_dispatch_0 : !hal.executable
    util.initializer.return
  }
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c607500 = constant 607500 : index
    %c3456 = constant 3456 : index
    %c1605632 = constant 1605632 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %c14 = constant 14 : index
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c50331680_i32 = constant 50331680 : i32
    %c1_i32 = constant 1 : i32
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    hal.device.switch<%device : !hal.device>
    #hal.device.match.executable.format<"vulkan-spirv-fb"> {
      hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
        %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
        %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
        %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
      ])
      %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
  util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer.return
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
util.initializer {
  %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  %device = hal.ex.shared_device : !hal.device
  %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
  util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
  util.initializer.return
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
  cond_br %value, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
  br ^bb5(%exe : !hal.executable)
^bb2:  // pred: ^bb0
  %true = constant true
  cond_br %true, ^bb3, ^bb4
^bb3:  // pred: ^bb2
  %0 = util.null : !hal.executable
  br ^bb5(%0 : !hal.executable)
^bb4:  // pred: ^bb2
  util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable):  // 2 preds: ^bb1, ^bb3
  util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
  util.initializer.return
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c607500 = constant 607500 : index
  %c3456 = constant 3456 : index
  %c1605632 = constant 1605632 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c14 = constant 14 : index
  %c1 = constant 1 : index
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %c50331680_i32 = constant 50331680 : i32
  %c1_i32 = constant 1 : i32
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
  cond_br %value, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
    %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
  ])
  %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
  br ^bb3
^bb2:  // pred: ^bb0
  util.unreachable "device not supported in the compiled configuration"
^bb3:  // pred: ^bb1
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
}

// -----// IR Dump After ConvertAffineToStandard //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
    util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    util.initializer.return
  }
  util.global private @_executable_layout_0 : !hal.executable_layout
  util.initializer {
    %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
    util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
    util.initializer.return
  }
  util.global private @_executable_conv_dispatch_0 : !hal.executable
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
    cond_br %value, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
    br ^bb5(%exe : !hal.executable)
  ^bb2:  // pred: ^bb0
    %true = constant true
    cond_br %true, ^bb3, ^bb4
  ^bb3:  // pred: ^bb2
    %0 = util.null : !hal.executable
    br ^bb5(%0 : !hal.executable)
  ^bb4:  // pred: ^bb2
    util.unreachable "device not supported in the compiled configuration"
  ^bb5(%1: !hal.executable):  // 2 preds: ^bb1, ^bb3
    util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
    util.initializer.return
  }
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c607500 = constant 607500 : index
    %c3456 = constant 3456 : index
    %c1605632 = constant 1605632 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %c14 = constant 14 : index
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c50331680_i32 = constant 50331680 : i32
    %c1_i32 = constant 1 : i32
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
    cond_br %value, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
    br ^bb3
  ^bb2:  // pred: ^bb0
    util.unreachable "device not supported in the compiled configuration"
  ^bb3:  // pred: ^bb1
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]}  {
  util.global private @_device_query_0 : i1
  util.global private @_device_query_0_ok : i1
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
    util.global.store %ok, @_device_query_0_ok : i1
    util.global.store %value, @_device_query_0 : i1
    util.initializer.return
  }
  util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
    util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    util.initializer.return
  }
  util.global private @_executable_layout_0 : !hal.executable_layout
  util.initializer {
    %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
    util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
    util.initializer.return
  }
  util.global private @_executable_conv_dispatch_0 : !hal.executable
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
    cond_br %value, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
    br ^bb5(%exe : !hal.executable)
  ^bb2:  // pred: ^bb0
    %true = constant true
    cond_br %true, ^bb3, ^bb4
  ^bb3:  // pred: ^bb2
    %0 = util.null : !hal.executable
    br ^bb5(%0 : !hal.executable)
  ^bb4:  // pred: ^bb2
    util.unreachable "device not supported in the compiled configuration"
  ^bb5(%1: !hal.executable):  // 2 preds: ^bb1, ^bb3
    util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
    util.initializer.return
  }
  hal.executable private @conv_dispatch_0 {
    hal.interface public @io {
      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
      hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c1 = constant 1 : index
        %c14 = constant 14 : index
        %c112 = constant 112 : index
        hal.return %c1, %c14, %c112 : index, index, index
      }
      builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %cst896_i32 = spv.Constant 896 : i32
            %cst20_i32 = spv.Constant 20 : i32
            %cst19_i32 = spv.Constant 19 : i32
            %cst18_i32 = spv.Constant 18 : i32
            %cst14_i32 = spv.Constant 14 : i32
            %cst13_i32 = spv.Constant 13 : i32
            %cst12_i32 = spv.Constant 12 : i32
            %cst7_i32 = spv.Constant 7 : i32
            %cst6_i32 = spv.Constant 6 : i32
            %cst675_i32 = spv.Constant 675 : i32
            %cst24_i32 = spv.Constant 24 : i32
            %cst72_i32 = spv.Constant 72 : i32
            %cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
            %cst8_i32 = spv.Constant 8 : i32
            %cst2_i32 = spv.Constant 2 : i32
            %cst4_i32 = spv.Constant 4 : i32
            %cst16_i32 = spv.Constant 16 : i32
            %cst32_i32 = spv.Constant 32 : i32
            %cst0_i32 = spv.Constant 0 : i32
            %cst112_i32 = spv.Constant 112 : i32
            %cst3_i32 = spv.Constant 3 : i32
            %cst1_i32 = spv.Constant 1 : i32
            %__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
            %__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
            %2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
            %4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
            %5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
            %6 = spv.IMul %3, %cst8_i32 : i32
            %7 = spv.IMul %1, %cst32_i32 : i32
            %8 = spv.IMul %3, %cst16_i32 : i32
            %__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
            %11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
            %13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
            %14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
            %15 = spv.IMul %12, %cst4_i32 : i32
            %16 = spv.IMul %10, %cst4_i32 : i32
            %17 = spv.IMul %14, %cst2_i32 : i32
            %18 = spv.IMul %12, %cst8_i32 : i32
            spv.mlir.loop {
              spv.Branch ^bb1(%5 : i32)
            ^bb1(%19: i32):  // 2 preds: ^bb0, ^bb2
              %20 = spv.SLessThan %19, %cst112_i32 : i32
              spv.BranchConditional %20, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %21 = spv.IMul %19, %cst2_i32 : i32
              %22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %53 = spv.SLessThan %48, %cst3_i32 : i32
                spv.BranchConditional %53, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                spv.mlir.loop {
                  spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %63, %cst3_i32 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %69 = spv.IAdd %7, %16 : i32
                  %70 = spv.SDiv %69, %cst4_i32 : i32
                  %71 = spv.IMul %48, %cst72_i32 : i32
                  %72 = spv.IMul %63, %cst24_i32 : i32
                  %73 = spv.IAdd %71, %72 : i32
                  %74 = spv.IAdd %73, %70 : i32
                  %75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
                  %77 = spv.IAdd %74, %cst8_i32 : i32
                  %78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
                  %80 = spv.IAdd %74, %cst16_i32 : i32
                  %81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
                  %83 = spv.IMul %48, %cst675_i32 : i32
                  %84 = spv.IMul %63, %cst3_i32 : i32
                  %85 = spv.IAdd %83, %84 : i32
                  %86 = spv.IMul %21, %cst675_i32 : i32
                  %87 = spv.IAdd %85, %86 : i32
                  %88 = spv.IMul %17, %cst675_i32 : i32
                  %89 = spv.IAdd %87, %88 : i32
                  %90 = spv.IMul %8, %cst3_i32 : i32
                  %91 = spv.IAdd %89, %90 : i32
                  %92 = spv.IMul %18, %cst3_i32 : i32
                  %93 = spv.IAdd %91, %92 : i32
                  %94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %95 = spv.Load "StorageBuffer" %94 : f32
                  %96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %97 = spv.IAdd %93, %cst1_i32 : i32
                  %98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %99 = spv.Load "StorageBuffer" %98 : f32
                  %100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
                  %101 = spv.IAdd %93, %cst2_i32 : i32
                  %102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %103 = spv.Load "StorageBuffer" %102 : f32
                  %104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
                  %105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
                  %106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
                  %107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
                  %108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
                  %109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
                  %110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
                  %111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
                  %112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
                  %113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
                  %114 = spv.IAdd %83, %86 : i32
                  %115 = spv.IAdd %114, %88 : i32
                  %116 = spv.IAdd %115, %90 : i32
                  %117 = spv.IAdd %116, %92 : i32
                  %118 = spv.IAdd %117, %84 : i32
                  %119 = spv.IAdd %118, %cst6_i32 : i32
                  %120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %121 = spv.Load "StorageBuffer" %120 : f32
                  %122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %123 = spv.IAdd %118, %cst7_i32 : i32
                  %124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
                  %127 = spv.IAdd %118, %cst8_i32 : i32
                  %128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %129 = spv.Load "StorageBuffer" %128 : f32
                  %130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
                  %131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
                  %134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
                  %135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
                  %136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
                  %137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
                  %138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
                  %139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
                  %140 = spv.IAdd %118, %cst12_i32 : i32
                  %141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %142 = spv.Load "StorageBuffer" %141 : f32
                  %143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %144 = spv.IAdd %118, %cst13_i32 : i32
                  %145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %146 = spv.Load "StorageBuffer" %145 : f32
                  %147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
                  %148 = spv.IAdd %118, %cst14_i32 : i32
                  %149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %150 = spv.Load "StorageBuffer" %149 : f32
                  %151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
                  %152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
                  %153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
                  %154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
                  %155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
                  %156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
                  %157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
                  %158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
                  %159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
                  %160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
                  %161 = spv.IAdd %118, %cst18_i32 : i32
                  %162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %163 = spv.Load "StorageBuffer" %162 : f32
                  %164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
                  %165 = spv.IAdd %118, %cst19_i32 : i32
                  %166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %167 = spv.Load "StorageBuffer" %166 : f32
                  %168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
                  %169 = spv.IAdd %118, %cst20_i32 : i32
                  %170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
                  %173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
                  %174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
                  %175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
                  %176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
                  %177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
                  %178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
                  %179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
                  spv.Store "Function" %54, %113 : vector<4xf32>
                  spv.Store "Function" %55, %139 : vector<4xf32>
                  spv.Store "Function" %56, %160 : vector<4xf32>
                  spv.Store "Function" %57, %181 : vector<4xf32>
                  %182 = spv.IAdd %63, %cst1_i32 : i32
                  spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                %58 = spv.Load "Function" %57 : vector<4xf32>
                %59 = spv.Load "Function" %56 : vector<4xf32>
                %60 = spv.Load "Function" %55 : vector<4xf32>
                %61 = spv.Load "Function" %54 : vector<4xf32>
                spv.Store "Function" %22, %61 : vector<4xf32>
                spv.Store "Function" %23, %60 : vector<4xf32>
                spv.Store "Function" %24, %59 : vector<4xf32>
                spv.Store "Function" %25, %58 : vector<4xf32>
                %62 = spv.IAdd %48, %cst1_i32 : i32
                spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %26 = spv.Load "Function" %25 : vector<4xf32>
              %27 = spv.Load "Function" %24 : vector<4xf32>
              %28 = spv.Load "Function" %23 : vector<4xf32>
              %29 = spv.Load "Function" %22 : vector<4xf32>
              %30 = spv.IAdd %7, %16 : i32
              %31 = spv.SDiv %30, %cst4_i32 : i32
              %32 = spv.IMul %19, %cst896_i32 : i32
              %33 = spv.IAdd %31, %32 : i32
              %34 = spv.IMul %14, %cst896_i32 : i32
              %35 = spv.IAdd %33, %34 : i32
              %36 = spv.IMul %6, %cst8_i32 : i32
              %37 = spv.IAdd %35, %36 : i32
              %38 = spv.IMul %15, %cst8_i32 : i32
              %39 = spv.IAdd %37, %38 : i32
              %40 = spv.IAdd %39, %cst24_i32 : i32
              %41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
              %42 = spv.IAdd %39, %cst16_i32 : i32
              %43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
              %44 = spv.IAdd %39, %cst8_i32 : i32
              %45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
              %46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
              %47 = spv.IAdd %19, %cst112_i32 : i32
              spv.Branch ^bb1(%47 : i32)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
        }
        hal.interface private @io {
          hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c607500 = constant 607500 : index
    %c3456 = constant 3456 : index
    %c1605632 = constant 1605632 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %c14 = constant 14 : index
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c50331680_i32 = constant 50331680 : i32
    %c1_i32 = constant 1 : i32
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %_device_query_0_ok = util.global.load @_device_query_0_ok : i1
    %_device_query_0 = util.global.load @_device_query_0 : i1
    cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
    ])
    %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
    br ^bb3
  ^bb2:  // pred: ^bb0
    util.unreachable "device not supported in the compiled configuration"
  ^bb3:  // pred: ^bb1
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  }
}

// -----// IR Dump After Canonicalizer //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
  util.global.store %ok, @_device_query_0_ok : i1
  util.global.store %value, @_device_query_0 : i1
  util.initializer.return
}

// -----// IR Dump After CSE //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
  util.global.store %ok, @_device_query_0_ok : i1
  util.global.store %value, @_device_query_0 : i1
  util.initializer.return
}

// -----// IR Dump After Canonicalizer //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
  util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer.return
}

// -----// IR Dump After Canonicalizer //----- //
util.initializer {
  %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  %device = hal.ex.shared_device : !hal.device
  %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
  util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
  util.initializer.return
}

// -----// IR Dump After CSE //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
  util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer.return
}

// -----// IR Dump After Canonicalizer //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
  cond_br %value, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
  br ^bb3(%exe : !hal.executable)
^bb2:  // pred: ^bb0
  %0 = util.null : !hal.executable
  br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable):  // 2 preds: ^bb1, ^bb2
  util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
  util.initializer.return
}

// -----// IR Dump After CSE //----- //
util.initializer {
  %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  %device = hal.ex.shared_device : !hal.device
  %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
  util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
  util.initializer.return
}

// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c1_i32 = constant 1 : i32
  %c50331680_i32 = constant 50331680 : i32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c14 = constant 14 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1605632 = constant 1605632 : index
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %_device_query_0 = util.global.load @_device_query_0 : i1
  cond_br %_device_query_0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
    %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
  ])
  %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
^bb2:  // pred: ^bb0
  util.unreachable "device not supported in the compiled configuration"
}

// -----// IR Dump After CSE //----- //
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
  cond_br %value, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
  br ^bb3(%exe : !hal.executable)
^bb2:  // pred: ^bb0
  %0 = util.null : !hal.executable
  br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable):  // 2 preds: ^bb1, ^bb2
  util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
  util.initializer.return
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeTargetExecutablesPass //----- //
hal.executable private @conv_dispatch_0 {
  hal.interface public @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.binary public @vulkan_spirv_fb attributes {data = dense<"0x0800000053505645A0E9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30008D050000030223070000010016000000F30000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000B0006008C000000474C534C2E7374642E343530000000000E00030000000000010000000F0009000500000013000000636F6E765F64697370617463685F3000050000000400000010000600130000001100000008000000020000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050007000A0000005F5F7265736F757263655F7661725F305F305F00050007000F0000005F5F7265736F757263655F7661725F305F315F0005000700100000005F5F7265736F757263655F7661725F305F325F000500060013000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000040000004800050007000000000000002300000000000000470003000700000002000000470004000A0000002100000000000000470004000A0000002200000000000000470004000D0000000600000010000000480005000C000000000000002300000000000000470003000C00000002000000470004000F0000002100000001000000470004000F0000002200000000000000470003000C0000000200000047000400100000002100000002000000470004001000000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B0004000100000005000000010000001600030009000000200000001D00030008000000090000001E000300070000000800000020000400060000000C000000070000003B000400060000000A0000000C000000170004000E00000009000000040000001D0003000D0000000E0000001E0003000C0000000D000000200004000B0000000C0000000C0000003B0004000B0000000F0000000C0000003B0004000B000000100000000C00000013000200120000002100030011000000120000002B0004000300000015000000800300002B0004000300000016000000140000002B0004000300000017000000130000002B0004000300000018000000120000002B00040003000000190000000E0000002B000400030000001A0000000D0000002B000400030000001B0000000C0000002B000400030000001C000000070000002B000400030000001D000000060000002B000400030000001E000000A30200002B000400030000001F000000180000002B0004000300000020000000480000002B0004000900000022000000000000002C0007000E0000002100000022000000220000002200000022000000170004002300000009000000030000002C00060023000000240000002200000022000000220000002B0004000300000025000000080000002B0004000300000026000000020000002B0004000300000027000000040000002B0004000300000028000000100000002B0004000300000029000000200000002B000400030000002A000000000000002B000400030000002B000000700000002B000400030000002C000000030000002B000400030000002D0000000100000014000200460000002000040049000000070000000E00000020000400690000000C0000000E000000200004007D0000000C000000090000003600050012000000130000000000000011000000F8000200140000003B000400490000004A000000070000003B000400490000004B000000070000003B000400490000004C000000070000003B000400490000004D000000070000003B0004004900000057000000070000003B0004004900000058000000070000003B0004004900000059000000070000003B000400490000005A000000070000003D000400020000002E0000000500000051000500030000002F0000002E000000000000003D00040002000000300000000500000051000500030000003100000030000000010000003D000400020000003200000005000000510005000300000033000000320000000200000084000500030000003400000031000000250000008400050003000000350000002F0000002900000084000500030000003600000031000000280000003D00040002000000370000000400000051000500030000003800000037000000000000003D00040002000000390000000400000051000500030000003A00000039000000010000003D000400020000003B0000000400000051000500030000003C0000003B0000000200000084000500030000003D0000003A0000002700000084000500030000003E000000380000002700000084000500030000003F0000003C000000260000008400050003000000400000003A00000025000000F900020041000000F800020041000000F50007000300000044000000F2000000450000003300000014000000B10005004600000047000000440000002B000000F6000400430000004200000000000000FA000400470000004200000043000000F8000200420000008400050003000000480000004400000026000000F90002004E000000F80002004E000000F50007000300000050000000DC000000510000002A00000042000000F50007000E00000052000000DB000000510000002100000042000000F50007000E00000053000000DA000000510000002100000042000000F50007000E00000054000000D9000000510000002100000042000000F50007000E00000055000000D8000000510000002100000042000000B10005004600000056000000500000002C000000F6000400450000004F00000000000000FA000400560000004F00000045000000F80002004F000000F90002005B000000F80002005B000000F5000700030000005D000000D70000005C0000002A0000004F000000F50007000E0000005E000000920000005C000000520000004F000000F50007000E0000005F000000AC0000005C000000530000004F000000F50007000E00000060000000C10000005C000000540000004F000000F50007000E00000061000000D60000005C000000550000004F000000B100050046000000620000005D0000002C000000F6000400510000005C00000000000000FA000400620000005C00000051000000F80002005C000000800005000300000063000000350000003E000000870005000300000064000000630000002700000084000500030000006500000050000000200000008400050003000000660000005D0000001F0000008000050003000000670000006500000066000000800005000300000068000000670000006400000041000600690000006A0000000F0000002A000000680000003D0004000E0000006B0000006A00000080000500030000006C000000680000002500000041000600690000006D0000000F0000002A0000006C0000003D0004000E0000006E0000006D00000080000500030000006F00000068000000280000004100060069000000700000000F0000002A0000006F0000003D0004000E0000007100000070000000840005000300000072000000500000001E0000008400050003000000730000005D0000002C0000008000050003000000740000007200000073000000840005000300000075000000480000001E00000080000500030000007600000074000000750000008400050003000000770000003F0000001E0000008000050003000000780000007600000077000000840005000300000079000000360000002C00000080000500030000007A000000780000007900000084000500030000007B000000400000002C00000080000500030000007C0000007A0000007B000000410006007D0000007E0000000A0000002A0000007C0000003D000400090000007F0000007E0000005200060023000000800000007F00000024000000000000008000050003000000810000007C0000002D000000410006007D000000820000000A0000002A000000810000003D0004000900000083000000820000005200060023000000840000008300000080000000010000008000050003000000850000007C00000026000000410006007D000000860000000A0000002A000000850000003D0004000900000087000000860000005200060023000000880000008700000084000000020000005100050009000000890000008800000000000000500007000E0000008A000000890000008900000089000000890000000C0008000E0000008B0000008C000000320000008A0000006B0000005E00000051000500090000008D0000008800000001000000500007000E0000008E0000008D0000008D0000008D0000008D0000000C0008000E0000008F0000008C000000320000008E0000006E0000008B0000005100050009000000900000008800000002000000500007000E00000091000000900000009000000090000000900000000C0008000E000000920000008C0000003200000091000000710000008F000000800005000300000093000000720000007500000080000500030000009400000093000000770000008000050003000000950000009400000079000000800005000300000096000000950000007B0000008000050003000000970000009600000073000000800005000300000098000000970000001D000000410006007D000000990000000A0000002A000000980000003D000400090000009A0000009900000052000600230000009B0000009A000000240000000000000080000500030000009C000000970000001C000000410006007D0000009D0000000A0000002A0000009C0000003D000400090000009E0000009D00000052000600230000009F0000009E0000009B000000010000008000050003000000A00000009700000025000000410006007D000000A10000000A0000002A000000A00000003D00040009000000A2000000A10000005200060023000000A3000000A20000009F000000020000005100050009000000A4000000A300000000000000500007000E000000A5000000A4000000A4000000A4000000A40000000C0008000E000000A60000008C00000032000000A50000006B0000005F0000005100050009000000A7000000A300000001000000500007000E000000A8000000A7000000A7000000A7000000A70000000C0008000E000000A90000008C00000032000000A80000006E000000A60000005100050009000000AA000000A300000002000000500007000E000000AB000000AA000000AA000000AA000000AA0000000C0008000E000000AC0000008C00000032000000AB00000071000000A90000008000050003000000AD000000970000001B000000410006007D000000AE0000000A0000002A000000AD0000003D00040009000000AF000000AE0000005200060023000000B0000000AF00000024000000000000008000050003000000B1000000970000001A000000410006007D000000B20000000A0000002A000000B10000003D00040009000000B3000000B20000005200060023000000B4000000B3000000B0000000010000008000050003000000B50000009700000019000000410006007D000000B60000000A0000002A000000B50000003D00040009000000B7000000B60000005200060023000000B8000000B7000000B4000000020000005100050009000000B9000000B800000000000000500007000E000000BA000000B9000000B9000000B9000000B90000000C0008000E000000BB0000008C00000032000000BA0000006B000000600000005100050009000000BC000000B800000001000000500007000E000000BD000000BC000000BC000000BC000000BC0000000C0008000E000000BE0000008C00000032000000BD0000006E000000BB0000005100050009000000BF000000B800000002000000500007000E000000C0000000BF000000BF000000BF000000BF0000000C0008000E000000C10000008C00000032000000C000000071000000BE0000008000050003000000C20000009700000018000000410006007D000000C30000000A0000002A000000C20000003D00040009000000C4000000C30000005200060023000000C5000000C400000024000000000000008000050003000000C60000009700000017000000410006007D000000C70000000A0000002A000000C60000003D00040009000000C8000000C70000005200060023000000C9000000C8000000C5000000010000008000050003000000CA0000009700000016000000410006007D000000CB0000000A0000002A000000CA0000003D00040009000000CC000000CB0000005200060023000000CD000000CC000000C9000000020000005100050009000000CE000000CD00000000000000500007000E000000CF000000CE000000CE000000CE000000CE0000000C0008000E000000D00000008C00000032000000CF0000006B000000610000005100050009000000D1000000CD00000001000000500007000E000000D2000000D1000000D1000000D1000000D10000000C0008000E000000D30000008C00000032000000D20000006E000000D00000005100050009000000D4000000CD00000002000000500007000E000000D5000000D4000000D4000000D4000000D40000000C0008000E000000D60000008C00000032000000D500000071000000D30000003E00030057000000920000003E00030058000000AC0000003E00030059000000C10000003E0003005A000000D60000008000050003000000D70000005D0000002D000000F90002005B000000F8000200510000003D0004000E000000D80000005A0000003D0004000E000000D9000000590000003D0004000E000000DA000000580000003D0004000E000000DB000000570000003E0003004A000000DB0000003E0003004B000000DA0000003E0003004C000000D90000003E0003004D000000D80000008000050003000000DC000000500000002D000000F90002004E000000F8000200450000003D0004000E000000DD0000004D0000003D0004000E000000DE0000004C0000003D0004000E000000DF0000004B0000003D0004000E000000E00000004A0000008000050003000000E1000000350000003E0000008700050003000000E2000000E1000000270000008400050003000000E300000044000000150000008000050003000000E4000000E2000000E30000008400050003000000E50000003C000000150000008000050003000000E6000000E4000000E50000008400050003000000E700000034000000250000008000050003000000E8000000E6000000E70000008400050003000000E90000003D000000250000008000050003000000EA000000E8000000E90000008000050003000000EB000000EA0000001F0000004100060069000000EC000000100000002A000000EB0000003E000300EC000000DD0000008000050003000000ED000000EA000000280000004100060069000000EE000000100000002A000000ED0000003E000300EE000000DE0000008000050003000000EF000000EA000000250000004100060069000000F0000000100000002A000000EF0000003E000300F0000000DF0000004100060069000000F1000000100000002A000000EA0000003E000300F1000000E00000008000050003000000F2000000440000002B000000F900020041000000F800020043000000FD0001003800010008000C0004000800"> : vector<5744xi8>, format = "vulkan-spirv-fb", mime_type = "application/x-flatbuffers"}
}

// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass //----- //
hal.executable private @conv_dispatch_0 {
  hal.interface public @io {
    hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.binary public @vulkan_spirv_fb attributes {data = dense<"0x0800000053505645A0E9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30008D050000030223070000010016000000F30000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000B0006008C000000474C534C2E7374642E343530000000000E00030000000000010000000F0009000500000013000000636F6E765F64697370617463685F3000050000000400000010000600130000001100000008000000020000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050007000A0000005F5F7265736F757263655F7661725F305F305F00050007000F0000005F5F7265736F757263655F7661725F305F315F0005000700100000005F5F7265736F757263655F7661725F305F325F000500060013000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000040000004800050007000000000000002300000000000000470003000700000002000000470004000A0000002100000000000000470004000A0000002200000000000000470004000D0000000600000010000000480005000C000000000000002300000000000000470003000C00000002000000470004000F0000002100000001000000470004000F0000002200000000000000470003000C0000000200000047000400100000002100000002000000470004001000000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B0004000100000005000000010000001600030009000000200000001D00030008000000090000001E000300070000000800000020000400060000000C000000070000003B000400060000000A0000000C000000170004000E00000009000000040000001D0003000D0000000E0000001E0003000C0000000D000000200004000B0000000C0000000C0000003B0004000B0000000F0000000C0000003B0004000B000000100000000C00000013000200120000002100030011000000120000002B0004000300000015000000800300002B0004000300000016000000140000002B0004000300000017000000130000002B0004000300000018000000120000002B00040003000000190000000E0000002B000400030000001A0000000D0000002B000400030000001B0000000C0000002B000400030000001C000000070000002B000400030000001D000000060000002B000400030000001E000000A30200002B000400030000001F000000180000002B0004000300000020000000480000002B0004000900000022000000000000002C0007000E0000002100000022000000220000002200000022000000170004002300000009000000030000002C00060023000000240000002200000022000000220000002B0004000300000025000000080000002B0004000300000026000000020000002B0004000300000027000000040000002B0004000300000028000000100000002B0004000300000029000000200000002B000400030000002A000000000000002B000400030000002B000000700000002B000400030000002C000000030000002B000400030000002D0000000100000014000200460000002000040049000000070000000E00000020000400690000000C0000000E000000200004007D0000000C000000090000003600050012000000130000000000000011000000F8000200140000003B000400490000004A000000070000003B000400490000004B000000070000003B000400490000004C000000070000003B000400490000004D000000070000003B0004004900000057000000070000003B0004004900000058000000070000003B0004004900000059000000070000003B000400490000005A000000070000003D000400020000002E0000000500000051000500030000002F0000002E000000000000003D00040002000000300000000500000051000500030000003100000030000000010000003D000400020000003200000005000000510005000300000033000000320000000200000084000500030000003400000031000000250000008400050003000000350000002F0000002900000084000500030000003600000031000000280000003D00040002000000370000000400000051000500030000003800000037000000000000003D00040002000000390000000400000051000500030000003A00000039000000010000003D000400020000003B0000000400000051000500030000003C0000003B0000000200000084000500030000003D0000003A0000002700000084000500030000003E000000380000002700000084000500030000003F0000003C000000260000008400050003000000400000003A00000025000000F900020041000000F800020041000000F50007000300000044000000F2000000450000003300000014000000B10005004600000047000000440000002B000000F6000400430000004200000000000000FA000400470000004200000043000000F8000200420000008400050003000000480000004400000026000000F90002004E000000F80002004E000000F50007000300000050000000DC000000510000002A00000042000000F50007000E00000052000000DB000000510000002100000042000000F50007000E00000053000000DA000000510000002100000042000000F50007000E00000054000000D9000000510000002100000042000000F50007000E00000055000000D8000000510000002100000042000000B10005004600000056000000500000002C000000F6000400450000004F00000000000000FA000400560000004F00000045000000F80002004F000000F90002005B000000F80002005B000000F5000700030000005D000000D70000005C0000002A0000004F000000F50007000E0000005E000000920000005C000000520000004F000000F50007000E0000005F000000AC0000005C000000530000004F000000F50007000E00000060000000C10000005C000000540000004F000000F50007000E00000061000000D60000005C000000550000004F000000B100050046000000620000005D0000002C000000F6000400510000005C00000000000000FA000400620000005C00000051000000F80002005C000000800005000300000063000000350000003E000000870005000300000064000000630000002700000084000500030000006500000050000000200000008400050003000000660000005D0000001F0000008000050003000000670000006500000066000000800005000300000068000000670000006400000041000600690000006A0000000F0000002A000000680000003D0004000E0000006B0000006A00000080000500030000006C000000680000002500000041000600690000006D0000000F0000002A0000006C0000003D0004000E0000006E0000006D00000080000500030000006F00000068000000280000004100060069000000700000000F0000002A0000006F0000003D0004000E0000007100000070000000840005000300000072000000500000001E0000008400050003000000730000005D0000002C0000008000050003000000740000007200000073000000840005000300000075000000480000001E00000080000500030000007600000074000000750000008400050003000000770000003F0000001E0000008000050003000000780000007600000077000000840005000300000079000000360000002C00000080000500030000007A000000780000007900000084000500030000007B000000400000002C00000080000500030000007C0000007A0000007B000000410006007D0000007E0000000A0000002A0000007C0000003D000400090000007F0000007E0000005200060023000000800000007F00000024000000000000008000050003000000810000007C0000002D000000410006007D000000820000000A0000002A000000810000003D0004000900000083000000820000005200060023000000840000008300000080000000010000008000050003000000850000007C00000026000000410006007D000000860000000A0000002A000000850000003D0004000900000087000000860000005200060023000000880000008700000084000000020000005100050009000000890000008800000000000000500007000E0000008A000000890000008900000089000000890000000C0008000E0000008B0000008C000000320000008A0000006B0000005E00000051000500090000008D0000008800000001000000500007000E0000008E0000008D0000008D0000008D0000008D0000000C0008000E0000008F0000008C000000320000008E0000006E0000008B0000005100050009000000900000008800000002000000500007000E00000091000000900000009000000090000000900000000C0008000E000000920000008C0000003200000091000000710000008F000000800005000300000093000000720000007500000080000500030000009400000093000000770000008000050003000000950000009400000079000000800005000300000096000000950000007B0000008000050003000000970000009600000073000000800005000300000098000000970000001D000000410006007D000000990000000A0000002A000000980000003D000400090000009A0000009900000052000600230000009B0000009A000000240000000000000080000500030000009C000000970000001C000000410006007D0000009D0000000A0000002A0000009C0000003D000400090000009E0000009D00000052000600230000009F0000009E0000009B000000010000008000050003000000A00000009700000025000000410006007D000000A10000000A0000002A000000A00000003D00040009000000A2000000A10000005200060023000000A3000000A20000009F000000020000005100050009000000A4000000A300000000000000500007000E000000A5000000A4000000A4000000A4000000A40000000C0008000E000000A60000008C00000032000000A50000006B0000005F0000005100050009000000A7000000A300000001000000500007000E000000A8000000A7000000A7000000A7000000A70000000C0008000E000000A90000008C00000032000000A80000006E000000A60000005100050009000000AA000000A300000002000000500007000E000000AB000000AA000000AA000000AA000000AA0000000C0008000E000000AC0000008C00000032000000AB00000071000000A90000008000050003000000AD000000970000001B000000410006007D000000AE0000000A0000002A000000AD0000003D00040009000000AF000000AE0000005200060023000000B0000000AF00000024000000000000008000050003000000B1000000970000001A000000410006007D000000B20000000A0000002A000000B10000003D00040009000000B3000000B20000005200060023000000B4000000B3000000B0000000010000008000050003000000B50000009700000019000000410006007D000000B60000000A0000002A000000B50000003D00040009000000B7000000B60000005200060023000000B8000000B7000000B4000000020000005100050009000000B9000000B800000000000000500007000E000000BA000000B9000000B9000000B9000000B90000000C0008000E000000BB0000008C00000032000000BA0000006B000000600000005100050009000000BC000000B800000001000000500007000E000000BD000000BC000000BC000000BC000000BC0000000C0008000E000000BE0000008C00000032000000BD0000006E000000BB0000005100050009000000BF000000B800000002000000500007000E000000C0000000BF000000BF000000BF000000BF0000000C0008000E000000C10000008C00000032000000C000000071000000BE0000008000050003000000C20000009700000018000000410006007D000000C30000000A0000002A000000C20000003D00040009000000C4000000C30000005200060023000000C5000000C400000024000000000000008000050003000000C60000009700000017000000410006007D000000C70000000A0000002A000000C60000003D00040009000000C8000000C70000005200060023000000C9000000C8000000C5000000010000008000050003000000CA0000009700000016000000410006007D000000CB0000000A0000002A000000CA0000003D00040009000000CC000000CB0000005200060023000000CD000000CC000000C9000000020000005100050009000000CE000000CD00000000000000500007000E000000CF000000CE000000CE000000CE000000CE0000000C0008000E000000D00000008C00000032000000CF0000006B000000610000005100050009000000D1000000CD00000001000000500007000E000000D2000000D1000000D1000000D1000000D10000000C0008000E000000D30000008C00000032000000D20000006E000000D00000005100050009000000D4000000CD00000002000000500007000E000000D5000000D4000000D4000000D4000000D40000000C0008000E000000D60000008C00000032000000D500000071000000D30000003E00030057000000920000003E00030058000000AC0000003E00030059000000C10000003E0003005A000000D60000008000050003000000D70000005D0000002D000000F90002005B000000F8000200510000003D0004000E000000D80000005A0000003D0004000E000000D9000000590000003D0004000E000000DA000000580000003D0004000E000000DB000000570000003E0003004A000000DB0000003E0003004B000000DA0000003E0003004C000000D90000003E0003004D000000D80000008000050003000000DC000000500000002D000000F90002004E000000F8000200450000003D0004000E000000DD0000004D0000003D0004000E000000DE0000004C0000003D0004000E000000DF0000004B0000003D0004000E000000E00000004A0000008000050003000000E1000000350000003E0000008700050003000000E2000000E1000000270000008400050003000000E300000044000000150000008000050003000000E4000000E2000000E30000008400050003000000E50000003C000000150000008000050003000000E6000000E4000000E50000008400050003000000E700000034000000250000008000050003000000E8000000E6000000E70000008400050003000000E90000003D000000250000008000050003000000EA000000E8000000E90000008000050003000000EB000000EA0000001F0000004100060069000000EC000000100000002A000000EB0000003E000300EC000000DD0000008000050003000000ED000000EA000000280000004100060069000000EE000000100000002A000000ED0000003E000300EE000000DE0000008000050003000000EF000000EA000000250000004100060069000000F0000000100000002A000000EF0000003E000300F0000000DF0000004100060069000000F1000000100000002A000000EA0000003E000300F1000000E00000008000050003000000F2000000440000002B000000F900020041000000F800020043000000FD0001003800010008000C0004000800"> : vector<5744xi8>, format = "vulkan-spirv-fb", mime_type = "application/x-flatbuffers"}
}

// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c1_i32 = constant 1 : i32
  %c50331680_i32 = constant 50331680 : i32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c14 = constant 14 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1605632 = constant 1605632 : index
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %_device_query_0 = util.global.load @_device_query_0 : i1
  cond_br %_device_query_0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
    %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
  ])
  %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
^bb2:  // pred: ^bb0
  util.unreachable "device not supported in the compiled configuration"
}

// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
  %_device_query_0 = util.global.load @_device_query_0 : i1
  %_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
  %c1_i32 = constant 1 : i32
  %c50331680_i32 = constant 50331680 : i32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c14 = constant 14 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1605632 = constant 1605632 : index
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
  %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
  %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  cond_br %_device_query_0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
    %c0 = (%buffer : !hal.buffer)[%c0, %c607500],
    %c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
  ])
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
  return %view : !hal.buffer_view
^bb2:  // pred: ^bb0
  util.unreachable "device not supported in the compiled configuration"
}

// -----// IR Dump After SymbolDCE //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_po