// NOTE: this dump log has been truncated; the final IR dump at the bottom is cut off.
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
}
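// The WrapEntryPointsPass above generated the public @conv ABI stub: it casts
// the opaque !hal.buffer_view arguments to statically shaped tensors, calls
// the private @_conv implementation, and casts the result back for the caller
// (the iree.abi.stub attribute marks it as such). The result shape follows
// from VALID-padded convolution arithmetic:
//   out = floor((in - filter) / stride) + 1 = floor((225 - 3) / 2) + 1 = 112
// so the 1x225x225x3 input and 3x3x3x32 filter at stride 2 yield 1x112x112x32.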
// -----// IR Dump After Canonicalizer //----- //
func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
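// The Inliner merged the private @_conv body into @conv and erased the dead
// callee, leaving a single function holding the mhlo.convolution. The
// Canonicalizer, CSE, and SymbolDCE dumps that follow show the same IR: with
// one convolution op there is nothing further to fold, deduplicate, or strip
// at this level.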
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After LegalizeInputTypes //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
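// ConvertMHLOToLinalgExt leaves this IR untouched; as far as I can tell it
// targets the MHLO ops that map onto IREE's LinalgExt dialect (sort, scatter,
// FFT, and similar), and a plain convolution is not among them. The
// convolution is handled by the next pass instead.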
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%cst = constant 0.000000e+00 : f32
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
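// ConvertMHLOToLinalgOnTensors is the first structural lowering: the opaque
// mhlo.convolution becomes destination-passing-style Linalg IR. A
// linalg.init_tensor materializes the 1x112x112x32 destination value,
// linalg.fill seeds it with 0.0, and linalg.conv_2d_nhwc_hwcf accumulates
// into it via its outs operand. A sketch of the named op's indexing
// semantics (pseudocode, not IR), with strides = [2, 2] and dilations =
// [1, 1] as carried in the attributes:
//   for (n, oh, ow, f)              // parallel iterators
//     for (kh, kw, c)               // reduction iterators
//       out[n][oh][ow][f] += in[n][oh*2 + kh][ow*2 + kw][c] * w[kh][kw][c][f];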
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After IREEImportPublic //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
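// ConvertConv2D1x1ConvToMatmul is a no-op here: it only rewrites 1x1
// convolutions (which are pointwise over the spatial dims and reduce to a
// matmul across channels), and this filter has a 3x3 spatial window.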
// -----// IR Dump After VerifyInputLegality //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After ExpandGlobalDynamicDims //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
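// None of the passes from VerifyInputLegality through
// ResolveShapedTypeResultDims changes this module: a single static-shaped
// fill + conv pair has no globals, dynamic dims, pad ops, elementwise
// generics, or fusion opportunities for them to act on, so the IR passes
// through unchanged to dispatch-region formation below.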
// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112_0 = constant 112 : index
%c32_1 = constant 32 : index
%4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %5 to %c112_0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %7 to %c112_0 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %9 to %c32_1 step %10 {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
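// DispatchLinalgOnTensors wrapped the fill + convolution into a single
// flow.dispatch.workgroups region with a 3-D workload of [32, 112, 112]
// (output channels on dim 0, the two 112-long spatial dims on dims 1 and 2)
// and tiled it with workgroup-distributed scf.for loops. Each loop covers a
// block-cyclic slice of its dimension; a sketch of the distribution, with
// hypothetical size/count values for illustration:
//   lb   = workgroup_id * workgroup_size;     // e.g. id 3, size 8 -> 24
//   step = workgroup_count * workgroup_size;  // e.g. count 14, size 8 -> 112
//   for (i = lb; i < 112; i += step) { ... }  // one tile per workgroup here
// The affine.min ops derive boundary-clamped slice sizes: an output tile of
// sz rows at stride 2 under a 3-row window reads 2*(sz - 1) + 3 = 2*sz + 1
// input rows (the first operand of the min), while the second operand caps
// that near the image edge from the remaining output rows; that is the
//   affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>
// pattern feeding the flow.dispatch.tensor.load sizes. The
// __internal_linalg_transform__ = "workgroup" marker flags the conv for the
// workgroup-level tiling path later in codegen.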
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112_0 = constant 112 : index
%c32_1 = constant 32 : index
%4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %5 to %c112_0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %7 to %c112_0 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %9 to %c32_1 step %10 {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112_0 = constant 112 : index
%c32_1 = constant 32 : index
%4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %5 to %c112_0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %7 to %c112_0 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %9 to %c32_1 step %10 {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32_0 = constant 32 : index
%c112_1 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %4 to %c112_1 step %5 {
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %6 to %c112_1 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %8 to %c32_0 step %9 {
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%11 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%13 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%14 = flow.dispatch.tensor.load %arg2, offsets = [0, %10, %12, 0], sizes = [1, %11, %13, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%16 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %15], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%17 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%23 = linalg.init_tensor [1, %20, %21, %22] : tensor<1x?x?x?xf32>
%24 = linalg.fill(%cst, %23) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%25 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%14, %16 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%24 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %25, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %17, %18, %19], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
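// Canonicalization shrank the per-tile destination: earlier dumps built the
// full 1x112x112x32 linalg.init_tensor outside the loops and took a
// tensor.extract_slice of it per tile; folding extract_slice(init_tensor)
// now materializes a tile-sized linalg.init_tensor [1, %20, %21, %22]
// directly inside the loop nest, so no full-size destination value survives
// into the dispatch region.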
// -----// IR Dump After OutlineDispatchRegions //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
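// OutlineDispatchRegions split the program into host and device parts: the
// dispatch region body became @conv_dispatch_0 inside a private
// flow.executable with a rank-3 workgroup entry point, and the host-side
// @conv now launches it via flow.dispatch @conv_dispatch_0::@conv_dispatch_0
// with the same [%c32, %c112, %c112] workload.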
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutables //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
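// Annotation: CSE finds no redundant subexpressions here, so this dump (and
// the CSE dumps after the later canonicalizations) is identical to the
// preceding canonicalized form.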
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After HoistUnstreamableOps //----- //
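// Annotation: HoistUnstreamableOps lifts the two hal.tensor.cast ops to the
// top of the function, ahead of the index constants, presumably because
// buffer-view casts cannot be placed inside a stream. Note the cast of %arg1
// (the filter) now comes first, so the dispatch operands read (%1, %0).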
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%c32 = constant 32 : index
%c112 = constant 112 : index
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
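// Annotation: this canonicalization merely hoists the constants back above
// the casts; the CSE and InsertConstantClones dumps that follow show no
// further change.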
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InsertConstantClones //----- //
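// Annotation: InsertConstantClones leaves the IR as-is, presumably because
// there are no constant tensors whose storage would need cloning before
// being mutated.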
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After FormStreams //----- //
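// Annotation: FormStreams is the interesting step here. The lone
// flow.dispatch is wrapped into a flow.ex.stream.fragment region that
// explicitly captures its operands (%c32, %c112, %1, %0), presumably so
// later passes can schedule the whole fragment as a single submission.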
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After OutlineLargeConstants //----- //
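// Annotation: OutlineLargeConstants has nothing to outline in this module
// (the only constants are small scalars and indices), so the full-module
// dump below matches the previous state.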
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
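// Annotation: canonicalization sinks the %c32/%c112 workgroup-count
// constants into the stream fragment body, shrinking the fragment's explicit
// captures from four operands to just the two input tensors.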
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
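// Annotation: SymbolDCE removes nothing: @conv_dispatch_0 is referenced by
// the dispatch inside @conv, and @conv itself is a public entry point.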
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
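// Annotation: this canonicalization rewrites the affine maps in the dispatch
// function so that the workgroup-size operands move from dimension positions
// into symbol positions, e.g.
//   affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
// becomes
//   affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
// which appears to be the canonical dim/symbol classification for values
// that are invariant within the enclosing region.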
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
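// Annotation: AssignTargetDevicesPass attaches the hal.device.targets module
// attribute: a single Vulkan/SPIR-V target (vulkan-spirv-fb) whose
// spv.target_env describes an ARM integrated GPU with 16-wide subgroups,
// 32 KiB of shared memory, and up to 512 invocations per workgroup. The
// function bodies are untouched.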
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
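// Annotation: VerifyTargetEnvironmentPass appears to be a pure check that
// the target attribute is well-formed; the IR is identical to the previous
// dump.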
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- //
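// Annotation: IdentifyConstantPoolsPass would presumably group large
// module-level constants into pools, but this module has none, so nothing
// changes.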
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- //
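// Annotation: likewise, MaterializeConstantPoolBuffersPass has no pools to
// materialize buffers for; the dump is unchanged.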
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
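// Annotation: another constant-reordering-only canonicalization (%c32 now
// precedes %c112 in both the dispatch function and the stream fragment); no
// structural change.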
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE //----- //
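// Annotation: SymbolDCE again finds no dead symbols; the full-module dump is
// repeated verbatim.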
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- //
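// Annotation: MaterializeInterfacesPass is the other major structural step.
// The flow.executable becomes a hal.executable with an explicit hal.interface
// (three set-0 StorageBuffer bindings: two read-only inputs and one
// write/discard output), a hal.executable.variant specialized for
// vulkan-spirv-fb, and an entry point. Inside the variant, the tensor
// arguments are replaced by hal.interface.binding.subspan ops and the
// flow.dispatch.workgroup.* queries become hal.interface.workgroup.*; the
// flow.dispatch in @conv gains a hal.bindings attribute tying its operands
// and result to those bindings.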
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
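// NOTE: at this point the module pairs the device-side hal.executable (the
// tiled linalg.conv_2d_nhwc_hwcf inside workgroup-distributed scf.for loops)
// with the host-side @conv that dispatches it over workload [32, 112, 112],
// i.e. channels x output columns x output rows. The 112 output size follows
// from the stride-2, no-padding conv: (225 - 3) / 2 + 1 = 112.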
// -----// IR Dump After Canonicalizer //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
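// NOTE: this canonicalization run only reorders the constant definitions
// (%c0/%c32/%c112/%cst) in the dispatch function and in @conv; the IR is
// otherwise identical to the previous dump.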
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
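// NOTE: the entry point carries no workgroup_size attribute yet, so there is
// presumably nothing for this pass to propagate; the
// hal.interface.workgroup.size ops stay symbolic and the dump is simply
// rescoped to the hal.executable.variant.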
// -----// IR Dump After LinalgBufferize //----- //
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = flow.dispatch.tensor.load %1, offsets = [0, %12, %14, 0], sizes = [1, %13, %15, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %18] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%20 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %18], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%22 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%23 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%24 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%25 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%26 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%27 = linalg.init_tensor [1, %24, %25, %26] : tensor<1x?x?x?xf32>
%28 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%29 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%30 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%31 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %28, %29, %30] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %31) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = linalg.fill(%cst, %27) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %19 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%31 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
%33 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%17, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%32 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
}
}
}
return
}
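// NOTE: bufferization introduces a memref subspan next to each existing
// !flow.dispatch.tensor subspan, and the fill/conv now exist in both tensor
// and memref form; the dead tensor versions are cleaned up by the passes that
// follow. The layout maps encode the row-major strides of the full buffers,
// e.g. for memref<1x225x225x3xf32>:
//   225 * 225 * 3 = 151875, 225 * 3 = 675, 3, 1
// and likewise 288/96/32 for the 3x3x3x32 filter and 401408/3584/32 for the
// 1x112x112x32 output (112 * 112 * 32 = 401408, 112 * 32 = 3584).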
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
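// NOTE: with the result dims resolved, the tensor-side loads, fill, and conv
// from the previous dump are gone; only the memref versions remain, plus the
// now-unused !flow.dispatch.tensor subspans (%1, %3, %5) awaiting cleanup.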
// -----// IR Dump After Canonicalizer //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// -----// IR Dump After CSE //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %17] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %21) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%21 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
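// NOTE: the preceding canonicalization already dropped the tensor-path
// init_tensor and its (-d0 + N, s0) mins; CSE then merges the duplicated
// channel-tile computation, so the output subview reuses %17 in place of a
// separately recomputed min.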
// -----// IR Dump After CleanupBufferAllocView //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %18) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
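// NOTE: after cleanup only the three bound memrefs remain (input image,
// filter, output), matching the three bindings of @io one-to-one.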
// -----// IR Dump After SetNumWorkgroups //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c112 = constant 112 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c1]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %c1]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c8]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c8]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c32]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c32]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (3, d0 * -2 + 227)>(%arg0)[%c1]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (17, d0 * -2 + 227)>(%arg1)[%c8]
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%14 = affine.min affine_map<(d0)[s0] -> (32, -d0 + 32)>(%arg2)[%c32]
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%16 = affine.min affine_map<(d0)[s0] -> (1, -d0 + 112)>(%arg0)[%c1]
%17 = affine.min affine_map<(d0)[s0] -> (8, -d0 + 112)>(%arg1)[%c8]
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %18) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
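// NOTE: SetNumWorkgroups selects the SPIRVVectorize pipeline with
// workloadPerWorkgroup = [32, 8, 1] and workgroup_size = [8, 2, 1]
// (16 threads, matching subgroup_size = 16). For the [32, 112, 112] workload
// this yields the returned workgroup count
//   [32/32, 112/8, 112/1] = [1, 14, 112].
// The symbolic workgroup sizes in the body are replaced by %c32/%c8/%c1, so
// e.g. affine.min (s0 * 2 + 1, ...) becomes min(3, ...) for s0 = 1, and the
// lowering.config tile sizes [[0, 1, 8, 32], [], [0, 1, 4, 4]] are attached
// to the fill and conv for the later tiling passes.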
// -----// IR Dump After Canonicalizer //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c32 step %6 {
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
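// NOTE: canonicalization folds the now-constant sizes into the affine maps
// ((s0 * 8), (s0 * 32), min(3, d0 * -2 + 227), ...) and drops the
// hal.interface.workgroup.size ops entirely; the z loop iterates directly
// over %workgroup_id_z.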
// -----// IR Dump After Canonicalizer //----- //
module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c32 step %6 {
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c32 step %6 {
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVRemoveOneTripTiledLoop //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%6 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%3)
%8 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%3)
%9 = memref.subview %0[0, %5, %7, 0] [1, %6, %8, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%10 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%4)
%11 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, %10] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%12 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%13 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%3)
%14 = memref.subview %2[0, %arg0, %3, %4] [1, %12, %13, %10] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %14) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%9, %11 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%14 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
return
}
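// NOTE: the y and x workgroup loops were single-trip and are removed: each of
// the 14 y workgroups covers exactly one 8-column strip of the 112 output
// columns (id_y * 8 to 112, step 14 * 8 = 112), and the single x workgroup
// covers all 32 channels (0 to 32, step 32). Only the z loop survives this
// pass, though with 112 workgroups and step 112 it also runs one iteration
// per workgroup.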
// -----// IR Dump After SPIRVTileAndDistribute //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%c3 = constant 3 : index
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%7 = memref.subview %0[0, %5, %6, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%8 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%9 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%10 = "gpu.thread_id"() {dimension = "x"} : () -> index
%11 = "gpu.thread_id"() {dimension = "y"} : () -> index
%12 = "gpu.thread_id"() {dimension = "z"} : () -> index
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%10]
%15 = memref.subview %9[0, %12, %13, %14] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %15) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%16 = "gpu.thread_id"() {dimension = "x"} : () -> index
%17 = "gpu.thread_id"() {dimension = "y"} : () -> index
%18 = "gpu.thread_id"() {dimension = "z"} : () -> index
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17]
%23 = memref.subview %7[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%24 = memref.subview %8[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%25 = memref.subview %9[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
scf.for %arg1 = %c0 to %c3 step %c1 {
scf.for %arg2 = %c0 to %c3 step %c1 {
%26 = memref.subview %23[0, %arg1, %arg2, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%27 = memref.subview %24[%arg1, %arg2, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "vectorize", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%26, %27 : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%25 : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// -----// IR Dump After SPIRVVectorize //----- //
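// SPIRVVectorize rewrites linalg.fill as vector.transfer_writes of a zero
// constant and unrolls linalg.conv_2d_nhwc_hwcf into vector.transfer_read /
// splat / vector.fma chains: the four output rows of the 1x1x4x4 thread tile
// are written at offsets %c0..%c3, each accumulating over the three input
// channels (the vector<1x3xf32> reads), while the input window is sampled at
// %c0, %c2, %c4, %c6 to honor the stride-2 convolution. The accumulators are
// threaded through the scf.for loops as iter_args.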
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%cst = constant dense<0.000000e+00> : vector<1x1x4x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%cst_1 = constant dense<0.000000e+00> : vector<1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = vector.extract_strided_slice %cst {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%13 = vector.extract_strided_slice %cst {offsets = [0, 0, 1, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%14 = vector.extract_strided_slice %cst {offsets = [0, 0, 2, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%15 = vector.extract_strided_slice %cst {offsets = [0, 0, 3, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%16 = "gpu.thread_id"() {dimension = "x"} : () -> index
%17 = "gpu.thread_id"() {dimension = "y"} : () -> index
%18 = "gpu.thread_id"() {dimension = "z"} : () -> index
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17]
%23 = memref.subview %6[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%24 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%25 = memref.subview %0[0, %24, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%26 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%27 = memref.subview %26[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %12, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %13, %27[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %14, %27[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %15, %27[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%28 = memref.subview %25[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%29 = memref.subview %26[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%30 = vector.transfer_read %29[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %29[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%32 = vector.transfer_read %29[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%33 = vector.transfer_read %29[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%34:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %30, %arg3 = %31, %arg4 = %32, %arg5 = %33) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%35:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%36 = memref.subview %28[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%37 = memref.subview %23[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%38 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%39 = vector.transfer_read %37[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%40 = vector.transfer_read %37[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%41 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%43 = vector.extract %38[0] : vector<1x4xf32>
%44 = vector.extract %42[0, 0] : vector<1x1xf32>
%45 = splat %44 : vector<4xf32>
%46 = vector.extract %arg7[0] : vector<1x4xf32>
%47 = vector.fma %45, %43, %46 : vector<4xf32>
%48 = vector.extract_strided_slice %41 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%49 = vector.extract %39[0] : vector<1x4xf32>
%50 = vector.extract %48[0, 0] : vector<1x1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.fma %51, %49, %47 : vector<4xf32>
%53 = vector.extract_strided_slice %41 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%54 = vector.extract %40[0] : vector<1x4xf32>
%55 = vector.extract %53[0, 0] : vector<1x1xf32>
%56 = splat %55 : vector<4xf32>
%57 = vector.fma %56, %54, %52 : vector<4xf32>
%58 = vector.insert %57, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%59 = vector.transfer_read %36[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%60 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.extract %38[0] : vector<1x4xf32>
%62 = vector.extract %60[0, 0] : vector<1x1xf32>
%63 = splat %62 : vector<4xf32>
%64 = vector.extract %arg8[0] : vector<1x4xf32>
%65 = vector.fma %63, %61, %64 : vector<4xf32>
%66 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%67 = vector.extract %39[0] : vector<1x4xf32>
%68 = vector.extract %66[0, 0] : vector<1x1xf32>
%69 = splat %68 : vector<4xf32>
%70 = vector.fma %69, %67, %65 : vector<4xf32>
%71 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%72 = vector.extract %40[0] : vector<1x4xf32>
%73 = vector.extract %71[0, 0] : vector<1x1xf32>
%74 = splat %73 : vector<4xf32>
%75 = vector.fma %74, %72, %70 : vector<4xf32>
%76 = vector.insert %75, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%77 = vector.transfer_read %36[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%78 = vector.extract_strided_slice %77 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%79 = vector.extract %38[0] : vector<1x4xf32>
%80 = vector.extract %78[0, 0] : vector<1x1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.extract %arg9[0] : vector<1x4xf32>
%83 = vector.fma %81, %79, %82 : vector<4xf32>
%84 = vector.extract_strided_slice %77 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%85 = vector.extract %39[0] : vector<1x4xf32>
%86 = vector.extract %84[0, 0] : vector<1x1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.fma %87, %85, %83 : vector<4xf32>
%89 = vector.extract_strided_slice %77 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%90 = vector.extract %40[0] : vector<1x4xf32>
%91 = vector.extract %89[0, 0] : vector<1x1xf32>
%92 = splat %91 : vector<4xf32>
%93 = vector.fma %92, %90, %88 : vector<4xf32>
%94 = vector.insert %93, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%95 = vector.transfer_read %36[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%96 = vector.extract_strided_slice %95 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%97 = vector.extract %38[0] : vector<1x4xf32>
%98 = vector.extract %96[0, 0] : vector<1x1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.extract %arg10[0] : vector<1x4xf32>
%101 = vector.fma %99, %97, %100 : vector<4xf32>
%102 = vector.extract_strided_slice %95 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%103 = vector.extract %39[0] : vector<1x4xf32>
%104 = vector.extract %102[0, 0] : vector<1x1xf32>
%105 = splat %104 : vector<4xf32>
%106 = vector.fma %105, %103, %101 : vector<4xf32>
%107 = vector.extract_strided_slice %95 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%108 = vector.extract %40[0] : vector<1x4xf32>
%109 = vector.extract %107[0, 0] : vector<1x1xf32>
%110 = splat %109 : vector<4xf32>
%111 = vector.fma %110, %108, %106 : vector<4xf32>
%112 = vector.insert %111, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
scf.yield %58, %76, %94, %112 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %35#0, %35#1, %35#2, %35#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %34#3, %29[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %34#2, %29[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %34#1, %29[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %34#0, %29[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
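// Canonicalization folds the four identical zero slices into one
// vector<1x1x1x4xf32> constant reused by all initializing transfer_writes and
// replaces the vector.extract / vector.insert pairs around the fma chains with
// vector.shape_cast between vector<1x4xf32> and vector<4xf32>.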
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVCopyToWorkgroupMemory //----- //
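// No copies to workgroup memory are introduced for this dispatch; the function
// body below is identical to the canonicalized IR above.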
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
// -----// IR Dump After LinalgExtToLoops //----- //
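// A no-op for this dispatch: there are no LinalgExt ops to lower, so the dump
// is unchanged.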
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
// -----// IR Dump After LinalgLowerToLoops //----- //
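// Also a no-op: vectorization already consumed every linalg op, so nothing
// remains to lower to loops and the body below is unchanged.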
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
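// LinalgLowerToLoops appears to have left this function unchanged from the
// previous dump: the convolution body was already fully vectorized into
// scf.for loops and vector ops, so no linalg ops remained to lower.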
// -----// IR Dump After ConvertAffineToStandard //----- //
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%c8 = constant 8 : index
%3 = muli %workgroup_id_y, %c8 : index
%c32 = constant 32 : index
%4 = muli %workgroup_id_x, %c32 : index
%c16 = constant 16 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%c4_1 = constant 4 : index
%10 = muli %8, %c4_1 : index
%c4_2 = constant 4 : index
%11 = muli %7, %c4_2 : index
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%c4_3 = constant 4 : index
%15 = muli %13, %c4_3 : index
%c4_4 = constant 4 : index
%16 = muli %12, %c4_4 : index
%c2_5 = constant 2 : index
%17 = muli %14, %c2_5 : index
%c8_6 = constant 8 : index
%18 = muli %13, %c8_6 : index
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%c2_7 = constant 2 : index
%20 = muli %arg0, %c2_7 : index
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
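// ConvertAffineToStandard rewrote each affine.apply above into explicit
// standard-dialect arithmetic, materializing a fresh index constant per use,
// e.g. (taken from this dump):
//   %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
// became
//   %c8 = constant 8 : index
//   %3 = muli %workgroup_id_y, %c8 : index
// The duplicate constants this introduces (%c4_1 ... %c4_4, %c8_6, %c2_7)
// are cleaned up by the canonicalizer below.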
// -----// IR Dump After Canonicalizer //----- //
module {
func @conv_dispatch_0() {
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = muli %8, %c4 : index
%11 = muli %7, %c4 : index
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = muli %13, %c4 : index
%16 = muli %12, %c4 : index
%17 = muli %14, %c2 : index
%18 = muli %13, %c8 : index
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = muli %arg0, %c2 : index
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
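// The canonicalizer deduplicated the index constants introduced by
// ConvertAffineToStandard (a single %c2, %c4, %c8, %c16, %c32 now serves all
// uses) and hoisted them to the top of the function; the loop structure is
// otherwise unchanged.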
// -----// IR Dump After CSE //----- //
module {
func @conv_dispatch_0() {
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
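// Workgroup tile offsets: %3 selects an 8-wide tile along the output's
// second spatial dimension, %4 a 32-wide tile of output channels, and %5 the
// matching input offset (16 = 8 * stride 2).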
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = muli %8, %c4 : index
%11 = muli %7, %c4 : index
%12 = muli %9, %c2 : index
%13 = muli %8, %c8 : index
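// Per-thread offsets within the workgroup tile: %10 and %11 pick each
// thread's 4 output positions x 4 output channels, while %12 and %13 pick
// the corresponding 1x3x9x3 input window (9 input columns = 4 outputs *
// stride 2 + 1).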
%14 = memref.subview %6[0, 0, 0, %11] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%15 = muli %arg0, %c2 : index
%16 = memref.subview %0[0, %15, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%18 = memref.subview %17[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %18[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %18[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %18[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %18[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%19 = memref.subview %16[0, %12, %13, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%20 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%21 = vector.transfer_read %18[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%22 = vector.transfer_read %18[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%23 = vector.transfer_read %18[%c0, %c0, %c3, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%24:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %20, %arg3 = %21, %arg4 = %22, %arg5 = %23) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%25:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%26 = memref.subview %19[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%27 = memref.subview %14[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %27[%c0, %c0, %c1, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%30 = vector.transfer_read %27[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
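// %28/%29/%30 are the filter vectors for input channels 0/1/2 at window
// position (%arg1, %arg6), 4 output channels each. Below, every
// vector<1x3xf32> read is one input pixel; its scalars are splatted and
// vector.fma'd against these filter vectors, accumulating the four output
// pixels at input columns 0/2/4/6 (stride 2).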
%31 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%32 = vector.extract_strided_slice %31 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%33 = vector.shape_cast %28 : vector<1x4xf32> to vector<4xf32>
%34 = vector.extract %32[0, 0] : vector<1x1xf32>
%35 = splat %34 : vector<4xf32>
%36 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%37 = vector.fma %35, %33, %36 : vector<4xf32>
%38 = vector.extract_strided_slice %31 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %29 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.fma %41, %39, %37 : vector<4xf32>
%43 = vector.extract_strided_slice %31 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%44 = vector.shape_cast %30 : vector<1x4xf32> to vector<4xf32>
%45 = vector.extract %43[0, 0] : vector<1x1xf32>
%46 = splat %45 : vector<4xf32>
%47 = vector.fma %46, %44, %42 : vector<4xf32>
%48 = vector.shape_cast %47 : vector<4xf32> to vector<1x4xf32>
%49 = vector.transfer_read %26[%c0, %c0, %c2, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%50 = vector.extract_strided_slice %49 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%51 = vector.extract %50[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%54 = vector.fma %52, %33, %53 : vector<4xf32>
%55 = vector.extract_strided_slice %49 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%56 = vector.extract %55[0, 0] : vector<1x1xf32>
%57 = splat %56 : vector<4xf32>
%58 = vector.fma %57, %39, %54 : vector<4xf32>
%59 = vector.extract_strided_slice %49 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%60 = vector.extract %59[0, 0] : vector<1x1xf32>
%61 = splat %60 : vector<4xf32>
%62 = vector.fma %61, %44, %58 : vector<4xf32>
%63 = vector.shape_cast %62 : vector<4xf32> to vector<1x4xf32>
%64 = vector.transfer_read %26[%c0, %c0, %c4, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%66 = vector.extract %65[0, 0] : vector<1x1xf32>
%67 = splat %66 : vector<4xf32>
%68 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%69 = vector.fma %67, %33, %68 : vector<4xf32>
%70 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%71 = vector.extract %70[0, 0] : vector<1x1xf32>
%72 = splat %71 : vector<4xf32>
%73 = vector.fma %72, %39, %69 : vector<4xf32>
%74 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.extract %74[0, 0] : vector<1x1xf32>
%76 = splat %75 : vector<4xf32>
%77 = vector.fma %76, %44, %73 : vector<4xf32>
%78 = vector.shape_cast %77 : vector<4xf32> to vector<1x4xf32>
%79 = vector.transfer_read %26[%c0, %c0, %c6, %c0], %cst {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%80 = vector.extract_strided_slice %79 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.extract %80[0, 0] : vector<1x1xf32>
%82 = splat %81 : vector<4xf32>
%83 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%84 = vector.fma %82, %33, %83 : vector<4xf32>
%85 = vector.extract_strided_slice %79 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.extract %85[0, 0] : vector<1x1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.fma %87, %39, %84 : vector<4xf32>
%89 = vector.extract_strided_slice %79 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%90 = vector.extract %89[0, 0] : vector<1x1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %44, %88 : vector<4xf32>
%93 = vector.shape_cast %92 : vector<4xf32> to vector<1x4xf32>
scf.yield %48, %63, %78, %93 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %25#0, %25#1, %25#2, %25#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %24#3, %18[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %24#2, %18[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %24#1, %18[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %24#0, %18[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
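// CSE merged the redundant ops the earlier passes left behind: the second
// triple of gpu.thread_id ops folded into the first, the two identical
// 1x1x4x4 output subviews collapsed into one (%18), and the repeated
// vector.shape_casts of the filter vectors (%33, %39, %44) are now computed
// once and reused across all four FMA chains.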
// -----// IR Dump After OptimizeVectorTransfer //----- //
func @conv_dispatch_0() {
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = muli %8, %c4 : index
%11 = muli %7, %c4 : index
%12 = muli %9, %c2 : index
%13 = muli %8, %c8 : index
%14 = memref.subview %6[0, 0, 0, %11] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%19 = muli %arg0, %c2 : index
%20 = memref.subview %0[0, %19, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%21 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%22 = memref.subview %21[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %20[0, %12, %13, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%24 = vector.shape_cast %15 : vector<4xf32> to vector<1x4xf32>
%25 = vector.shape_cast %16 : vector<4xf32> to vector<1x4xf32>
%26 = vector.shape_cast %17 : vector<4xf32> to vector<1x4xf32>
%27 = vector.shape_cast %18 : vector<4xf32> to vector<1x4xf32>
%28:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %24, %arg3 = %25, %arg4 = %26, %arg5 = %27) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%33:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%34 = memref.subview %23[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%35 = memref.subview %14[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%41 = vector.extract %40[0] : vector<1xf32>
%42 = splat %41 : vector<4xf32>
%43 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%44 = vector.fma %42, %36, %43 : vector<4xf32>
%45 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%46 = vector.extract %45[0] : vector<1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %37, %44 : vector<4xf32>
%49 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%50 = vector.extract %49[0] : vector<1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.fma %51, %38, %48 : vector<4xf32>
%53 = vector.shape_cast %52 : vector<4xf32> to vector<1x4xf32>
%54 = vector.transfer_read %34[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%55 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%56 = vector.extract %55[0] : vector<1xf32>
%57 = splat %56 : vector<4xf32>
%58 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%59 = vector.fma %57, %36, %58 : vector<4xf32>
%60 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%61 = vector.extract %60[0] : vector<1xf32>
%62 = splat %61 : vector<4xf32>
%63 = vector.fma %62, %37, %59 : vector<4xf32>
%64 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %38, %63 : vector<4xf32>
%68 = vector.shape_cast %67 : vector<4xf32> to vector<1x4xf32>
%69 = vector.transfer_read %34[%c0, %c0, %c4, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract %70[0] : vector<1xf32>
%72 = splat %71 : vector<4xf32>
%73 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%74 = vector.fma %72, %36, %73 : vector<4xf32>
%75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.fma %77, %37, %74 : vector<4xf32>
%79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%80 = vector.extract %79[0] : vector<1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.fma %81, %38, %78 : vector<4xf32>
%83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
%84 = vector.transfer_read %34[%c0, %c0, %c6, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%85 = vector.extract_strided_slice %84 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%86 = vector.extract %85[0] : vector<1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%89 = vector.fma %87, %36, %88 : vector<4xf32>
%90 = vector.extract_strided_slice %84 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%91 = vector.extract %90[0] : vector<1xf32>
%92 = splat %91 : vector<4xf32>
%93 = vector.fma %92, %37, %89 : vector<4xf32>
%94 = vector.extract_strided_slice %84 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%95 = vector.extract %94[0] : vector<1xf32>
%96 = splat %95 : vector<4xf32>
%97 = vector.fma %96, %38, %93 : vector<4xf32>
%98 = vector.shape_cast %97 : vector<4xf32> to vector<1x4xf32>
scf.yield %53, %68, %83, %98 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%29 = vector.shape_cast %28#3 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %29, %22[%c0, %c0, %c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%30 = vector.shape_cast %28#2 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %30, %22[%c0, %c0, %c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%31 = vector.shape_cast %28#1 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %31, %22[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = vector.shape_cast %28#0 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %32, %22[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
// -----// IR Dump After SPIRVLowerExecutableTarget //----- //
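// The executable variant after the whole SPIR-V lowering pipeline: the entry
// point carries translation.info (passPipeline = "SPIRVVectorize",
// workloadPerWorkgroup = [32, 8, 1]) and workgroup_size = [8, 2, 1]; its body
// is the tiled and vectorized convolution loop nest shown above.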
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
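  // Workgroup count check: tiling the 1x112x112x32 output by
  // workloadPerWorkgroup = [32, 8, 1] yields 32/32 = 1 workgroup along x,
  // 112/8 = 14 along y, and 112/1 = 112 along z, matching the
  // (%c1, %c14, %c112) returned above.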
builtin.module {
func @conv_dispatch_0() {
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = muli %8, %c4 : index
%11 = muli %7, %c4 : index
%12 = muli %9, %c2 : index
%13 = muli %8, %c8 : index
%14 = memref.subview %6[0, 0, 0, %11] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<4xf32>
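      // Each thread carries four vector<4xf32> accumulators, one per output
      // pixel of its 1x1x4x4 tile: four adjacent output columns, each holding
      // four consecutive output channels.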
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%19 = muli %arg0, %c2 : index
%20 = memref.subview %0[0, %19, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%21 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%22 = memref.subview %21[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %20[0, %12, %13, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%24 = vector.shape_cast %15 : vector<4xf32> to vector<1x4xf32>
%25 = vector.shape_cast %16 : vector<4xf32> to vector<1x4xf32>
%26 = vector.shape_cast %17 : vector<4xf32> to vector<1x4xf32>
%27 = vector.shape_cast %18 : vector<4xf32> to vector<1x4xf32>
%28:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %24, %arg3 = %25, %arg4 = %26, %arg5 = %27) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%33:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%34 = memref.subview %23[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%35 = memref.subview %14[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%41 = vector.extract %40[0] : vector<1xf32>
%42 = splat %41 : vector<4xf32>
%43 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%44 = vector.fma %42, %36, %43 : vector<4xf32>
%45 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%46 = vector.extract %45[0] : vector<1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %37, %44 : vector<4xf32>
%49 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%50 = vector.extract %49[0] : vector<1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.fma %51, %38, %48 : vector<4xf32>
%53 = vector.shape_cast %52 : vector<4xf32> to vector<1x4xf32>
%54 = vector.transfer_read %34[%c0, %c0, %c2, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%55 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%56 = vector.extract %55[0] : vector<1xf32>
%57 = splat %56 : vector<4xf32>
%58 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%59 = vector.fma %57, %36, %58 : vector<4xf32>
%60 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%61 = vector.extract %60[0] : vector<1xf32>
%62 = splat %61 : vector<4xf32>
%63 = vector.fma %62, %37, %59 : vector<4xf32>
%64 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %38, %63 : vector<4xf32>
%68 = vector.shape_cast %67 : vector<4xf32> to vector<1x4xf32>
%69 = vector.transfer_read %34[%c0, %c0, %c4, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract %70[0] : vector<1xf32>
%72 = splat %71 : vector<4xf32>
%73 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%74 = vector.fma %72, %36, %73 : vector<4xf32>
%75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.fma %77, %37, %74 : vector<4xf32>
%79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%80 = vector.extract %79[0] : vector<1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.fma %81, %38, %78 : vector<4xf32>
%83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
%84 = vector.transfer_read %34[%c0, %c0, %c6, %c0], %cst {in_bounds = [true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%85 = vector.extract_strided_slice %84 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%86 = vector.extract %85[0] : vector<1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%89 = vector.fma %87, %36, %88 : vector<4xf32>
%90 = vector.extract_strided_slice %84 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%91 = vector.extract %90[0] : vector<1xf32>
%92 = splat %91 : vector<4xf32>
%93 = vector.fma %92, %37, %89 : vector<4xf32>
%94 = vector.extract_strided_slice %84 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%95 = vector.extract %94[0] : vector<1xf32>
%96 = splat %95 : vector<4xf32>
%97 = vector.fma %96, %38, %93 : vector<4xf32>
%98 = vector.shape_cast %97 : vector<4xf32> to vector<1x4xf32>
scf.yield %53, %68, %83, %98 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%29 = vector.shape_cast %28#3 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %29, %22[%c0, %c0, %c3, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%30 = vector.shape_cast %28#2 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %30, %22[%c0, %c0, %c2, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%31 = vector.shape_cast %28#1 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %31, %22[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = vector.shape_cast %28#0 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %32, %22[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
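// For reference, the computation this dispatch implements, restated as a
// scalar formula (a sketch inferred from the tensor types and the index
// arithmetic above, not compiler output):
//
//   out[0, oh, ow, oc] = sum over kh in 0..2, kw in 0..2, ic in 0..2 of
//       in[0, 2*oh + kh, 2*ow + kw, ic] * filter[kh, kw, ic, oc]
//
// with each thread producing four consecutive ow values, each as a vector of
// four oc values; these are the fma ladders in the loop nest above.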
// -----// IR Dump After FoldSubViewOps //----- //
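// FoldSubViewOps folds the memref.subview chain away: the transfer reads and
// writes below address the original bindings %0, %1, and %2 directly, with
// the former subview offsets re-expressed as chains of single-term
// affine.apply ops over the loop induction variables and thread/workgroup ids.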
module {
func @conv_dispatch_0() {
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%18:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %14, %arg3 = %15, %arg4 = %16, %arg5 = %17) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%47:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%48 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%49 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%50 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%51 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%50)[%4]
%52 = vector.transfer_read %1[%48, %49, %c0, %51], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%53 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%54 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%55 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%56 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%55)[%4]
%57 = vector.transfer_read %1[%53, %54, %c1, %56], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%58 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%59 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%60 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%61 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%60)[%4]
%62 = vector.transfer_read %1[%58, %59, %c2, %61], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%63 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%64 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%65 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%63)[%11]
%66 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%64)[%12]
%67 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%65)[%13]
%68 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%66)[%5]
%69 = vector.transfer_read %0[%c0, %67, %68, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract %70[0] : vector<1xf32>
%72 = splat %71 : vector<4xf32>
%73 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%74 = vector.fma %72, %52, %73 : vector<4xf32>
%75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.fma %77, %57, %74 : vector<4xf32>
%79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%80 = vector.extract %79[0] : vector<1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.fma %81, %62, %78 : vector<4xf32>
%83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
%84 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%85 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%arg6]
%86 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%84)[%11]
%87 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%85)[%12]
%88 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%86)[%13]
%89 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%87)[%5]
%90 = vector.transfer_read %0[%c0, %88, %89, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%91 = vector.extract_strided_slice %90 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%92 = vector.extract %91[0] : vector<1xf32>
%93 = splat %92 : vector<4xf32>
%94 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%95 = vector.fma %93, %52, %94 : vector<4xf32>
%96 = vector.extract_strided_slice %90 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%97 = vector.extract %96[0] : vector<1xf32>
%98 = splat %97 : vector<4xf32>
%99 = vector.fma %98, %57, %95 : vector<4xf32>
%100 = vector.extract_strided_slice %90 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%101 = vector.extract %100[0] : vector<1xf32>
%102 = splat %101 : vector<4xf32>
%103 = vector.fma %102, %62, %99 : vector<4xf32>
%104 = vector.shape_cast %103 : vector<4xf32> to vector<1x4xf32>
%105 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%106 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c4)[%arg6]
%107 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%105)[%11]
%108 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%106)[%12]
%109 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%107)[%13]
%110 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%108)[%5]
%111 = vector.transfer_read %0[%c0, %109, %110, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%112 = vector.extract_strided_slice %111 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%113 = vector.extract %112[0] : vector<1xf32>
%114 = splat %113 : vector<4xf32>
%115 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%116 = vector.fma %114, %52, %115 : vector<4xf32>
%117 = vector.extract_strided_slice %111 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %57, %116 : vector<4xf32>
%121 = vector.extract_strided_slice %111 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%122 = vector.extract %121[0] : vector<1xf32>
%123 = splat %122 : vector<4xf32>
%124 = vector.fma %123, %62, %120 : vector<4xf32>
%125 = vector.shape_cast %124 : vector<4xf32> to vector<1x4xf32>
%126 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%127 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c6)[%arg6]
%128 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%126)[%11]
%129 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%127)[%12]
%130 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%128)[%13]
%131 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%129)[%5]
%132 = vector.transfer_read %0[%c0, %130, %131, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%133 = vector.extract_strided_slice %132 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%134 = vector.extract %133[0] : vector<1xf32>
%135 = splat %134 : vector<4xf32>
%136 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%137 = vector.fma %135, %52, %136 : vector<4xf32>
%138 = vector.extract_strided_slice %132 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%139 = vector.extract %138[0] : vector<1xf32>
%140 = splat %139 : vector<4xf32>
%141 = vector.fma %140, %57, %137 : vector<4xf32>
%142 = vector.extract_strided_slice %132 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%143 = vector.extract %142[0] : vector<1xf32>
%144 = splat %143 : vector<4xf32>
%145 = vector.fma %144, %62, %141 : vector<4xf32>
%146 = vector.shape_cast %145 : vector<4xf32> to vector<1x4xf32>
scf.yield %83, %104, %125, %146 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %47#0, %47#1, %47#2, %47#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%19 = vector.shape_cast %18#3 : vector<1x4xf32> to vector<4xf32>
%20 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%21 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c3)[%9]
%22 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%23 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%20)[%arg0]
%24 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%21)[%3]
%25 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%22)[%4]
vector.transfer_write %19, %2[%c0, %23, %24, %25] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%26 = vector.shape_cast %18#2 : vector<1x4xf32> to vector<4xf32>
%27 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%28 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%9]
%29 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%30 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%27)[%arg0]
%31 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%28)[%3]
%32 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%29)[%4]
vector.transfer_write %26, %2[%c0, %30, %31, %32] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%33 = vector.shape_cast %18#1 : vector<1x4xf32> to vector<4xf32>
%34 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%35 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c1)[%9]
%36 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%37 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%34)[%arg0]
%38 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%35)[%3]
%39 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%36)[%4]
vector.transfer_write %33, %2[%c0, %37, %38, %39] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%40 = vector.shape_cast %18#0 : vector<1x4xf32> to vector<4xf32>
%41 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%42 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%9]
%43 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%44 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%41)[%arg0]
%45 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%42)[%3]
%46 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%43)[%4]
vector.transfer_write %40, %2[%c0, %44, %45, %46] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::FoldDimOverShapeCarryingOpPass //----- //
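// There are no dim-style queries to fold in this dispatch, so the function
// body appears unchanged by this pass; only the dump scope (function rather
// than module) differs from the previous dump.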
func @conv_dispatch_0() {
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%15 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%16 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%17 = vector.shape_cast %cst_0 : vector<1x1x1x4xf32> to vector<1x4xf32>
%18:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %14, %arg3 = %15, %arg4 = %16, %arg5 = %17) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%47:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%48 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%49 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%50 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%51 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%50)[%4]
%52 = vector.transfer_read %1[%48, %49, %c0, %51], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%53 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%54 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%55 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%56 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%55)[%4]
%57 = vector.transfer_read %1[%53, %54, %c1, %56], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%58 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%59 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%60 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%61 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%60)[%4]
%62 = vector.transfer_read %1[%58, %59, %c2, %61], %cst {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%63 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%64 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg6]
%65 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%63)[%11]
%66 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%64)[%12]
%67 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%65)[%13]
%68 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%66)[%5]
%69 = vector.transfer_read %0[%c0, %67, %68, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%70 = vector.extract_strided_slice %69 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract %70[0] : vector<1xf32>
%72 = splat %71 : vector<4xf32>
%73 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%74 = vector.fma %72, %52, %73 : vector<4xf32>
%75 = vector.extract_strided_slice %69 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.fma %77, %57, %74 : vector<4xf32>
%79 = vector.extract_strided_slice %69 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%80 = vector.extract %79[0] : vector<1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.fma %81, %62, %78 : vector<4xf32>
%83 = vector.shape_cast %82 : vector<4xf32> to vector<1x4xf32>
%84 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%85 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%arg6]
%86 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%84)[%11]
%87 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%85)[%12]
%88 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%86)[%13]
%89 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%87)[%5]
%90 = vector.transfer_read %0[%c0, %88, %89, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%91 = vector.extract_strided_slice %90 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%92 = vector.extract %91[0] : vector<1xf32>
%93 = splat %92 : vector<4xf32>
%94 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%95 = vector.fma %93, %52, %94 : vector<4xf32>
%96 = vector.extract_strided_slice %90 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%97 = vector.extract %96[0] : vector<1xf32>
%98 = splat %97 : vector<4xf32>
%99 = vector.fma %98, %57, %95 : vector<4xf32>
%100 = vector.extract_strided_slice %90 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%101 = vector.extract %100[0] : vector<1xf32>
%102 = splat %101 : vector<4xf32>
%103 = vector.fma %102, %62, %99 : vector<4xf32>
%104 = vector.shape_cast %103 : vector<4xf32> to vector<1x4xf32>
%105 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%106 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c4)[%arg6]
%107 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%105)[%11]
%108 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%106)[%12]
%109 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%107)[%13]
%110 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%108)[%5]
%111 = vector.transfer_read %0[%c0, %109, %110, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%112 = vector.extract_strided_slice %111 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%113 = vector.extract %112[0] : vector<1xf32>
%114 = splat %113 : vector<4xf32>
%115 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%116 = vector.fma %114, %52, %115 : vector<4xf32>
%117 = vector.extract_strided_slice %111 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %57, %116 : vector<4xf32>
%121 = vector.extract_strided_slice %111 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%122 = vector.extract %121[0] : vector<1xf32>
%123 = splat %122 : vector<4xf32>
%124 = vector.fma %123, %62, %120 : vector<4xf32>
%125 = vector.shape_cast %124 : vector<4xf32> to vector<1x4xf32>
%126 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%arg1]
%127 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c6)[%arg6]
%128 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%126)[%11]
%129 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%127)[%12]
%130 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%128)[%13]
%131 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%129)[%5]
%132 = vector.transfer_read %0[%c0, %130, %131, %c0], %cst {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%133 = vector.extract_strided_slice %132 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%134 = vector.extract %133[0] : vector<1xf32>
%135 = splat %134 : vector<4xf32>
%136 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%137 = vector.fma %135, %52, %136 : vector<4xf32>
%138 = vector.extract_strided_slice %132 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%139 = vector.extract %138[0] : vector<1xf32>
%140 = splat %139 : vector<4xf32>
%141 = vector.fma %140, %57, %137 : vector<4xf32>
%142 = vector.extract_strided_slice %132 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%143 = vector.extract %142[0] : vector<1xf32>
%144 = splat %143 : vector<4xf32>
%145 = vector.fma %144, %62, %141 : vector<4xf32>
%146 = vector.shape_cast %145 : vector<4xf32> to vector<1x4xf32>
scf.yield %83, %104, %125, %146 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %47#0, %47#1, %47#2, %47#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%19 = vector.shape_cast %18#3 : vector<1x4xf32> to vector<4xf32>
%20 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%21 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c3)[%9]
%22 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%23 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%20)[%arg0]
%24 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%21)[%3]
%25 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%22)[%4]
vector.transfer_write %19, %2[%c0, %23, %24, %25] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%26 = vector.shape_cast %18#2 : vector<1x4xf32> to vector<4xf32>
%27 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%28 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c2)[%9]
%29 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%30 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%27)[%arg0]
%31 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%28)[%3]
%32 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%29)[%4]
vector.transfer_write %26, %2[%c0, %30, %31, %32] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%33 = vector.shape_cast %18#1 : vector<1x4xf32> to vector<4xf32>
%34 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%35 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c1)[%9]
%36 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%37 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%34)[%arg0]
%38 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%35)[%3]
%39 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%36)[%4]
vector.transfer_write %33, %2[%c0, %37, %38, %39] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%40 = vector.shape_cast %18#0 : vector<1x4xf32> to vector<4xf32>
%41 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%8]
%42 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%9]
%43 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%c0)[%10]
%44 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%41)[%arg0]
%45 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%42)[%3]
%46 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%43)[%4]
vector.transfer_write %40, %2[%c0, %44, %45, %46] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
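// Canonicalization composes each affine.apply chain into a single map per
// index (e.g. (d0)[s0, s1] -> (d0 + s0 + s1), with the constant +1/+2/+3 and
// +2/+4/+6 offsets folded in) and replaces the shape_cast'ed zero vectors
// with one dense<0.000000e+00> vector<1x4xf32> constant feeding the iter_args
// directly.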
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%33 = vector.transfer_read %1[%arg1, %arg6, %c0, %32], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%35 = vector.transfer_read %1[%arg1, %arg6, %c1, %34], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%36 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%37 = vector.transfer_read %1[%arg1, %arg6, %c2, %36], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%39 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%40 = vector.transfer_read %0[%c0, %38, %39, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%41 = vector.extract_strided_slice %40 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%42 = vector.extract %41[0] : vector<1xf32>
%43 = splat %42 : vector<4xf32>
%44 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%45 = vector.fma %43, %33, %44 : vector<4xf32>
%46 = vector.extract_strided_slice %40 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%47 = vector.extract %46[0] : vector<1xf32>
%48 = splat %47 : vector<4xf32>
%49 = vector.fma %48, %35, %45 : vector<4xf32>
%50 = vector.extract_strided_slice %40 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%51 = vector.extract %50[0] : vector<1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %37, %49 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%56 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%57 = vector.transfer_read %0[%c0, %55, %56, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%58 = vector.extract_strided_slice %57 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%59 = vector.extract %58[0] : vector<1xf32>
%60 = splat %59 : vector<4xf32>
%61 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%62 = vector.fma %60, %33, %61 : vector<4xf32>
%63 = vector.extract_strided_slice %57 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%64 = vector.extract %63[0] : vector<1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %35, %62 : vector<4xf32>
%67 = vector.extract_strided_slice %57 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%68 = vector.extract %67[0] : vector<1xf32>
%69 = splat %68 : vector<4xf32>
%70 = vector.fma %69, %37, %66 : vector<4xf32>
%71 = vector.shape_cast %70 : vector<4xf32> to vector<1x4xf32>
%72 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%73 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%74 = vector.transfer_read %0[%c0, %72, %73, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%75 = vector.extract_strided_slice %74 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %33, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %74 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%81 = vector.extract %80[0] : vector<1xf32>
%82 = splat %81 : vector<4xf32>
%83 = vector.fma %82, %35, %79 : vector<4xf32>
%84 = vector.extract_strided_slice %74 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%85 = vector.extract %84[0] : vector<1xf32>
%86 = splat %85 : vector<4xf32>
%87 = vector.fma %86, %37, %83 : vector<4xf32>
%88 = vector.shape_cast %87 : vector<4xf32> to vector<1x4xf32>
%89 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%90 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%91 = vector.transfer_read %0[%c0, %89, %90, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%93 = vector.extract %92[0] : vector<1xf32>
%94 = splat %93 : vector<4xf32>
%95 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%96 = vector.fma %94, %33, %95 : vector<4xf32>
%97 = vector.extract_strided_slice %91 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%98 = vector.extract %97[0] : vector<1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.fma %99, %35, %96 : vector<4xf32>
%101 = vector.extract_strided_slice %91 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%102 = vector.extract %101[0] : vector<1xf32>
%103 = splat %102 : vector<4xf32>
%104 = vector.fma %103, %37, %100 : vector<4xf32>
%105 = vector.shape_cast %104 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %71, %88, %105 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
vector.transfer_write %15, %2[%c0, %16, %17, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%19 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
vector.transfer_write %19, %2[%c0, %20, %21, %22] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%23 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
vector.transfer_write %23, %2[%c0, %24, %25, %26] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%27 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
%28 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%29 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
vector.transfer_write %27, %2[%c0, %28, %29, %30] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
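// CSE deduplicates the now-identical affine.apply results: the filter column
// index %26 is computed once and shared by all three filter loads, and the
// output row index %16 is reused by all four transfer_writes.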
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%25:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%27 = vector.transfer_read %1[%arg1, %arg6, %c0, %26], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%28 = vector.transfer_read %1[%arg1, %arg6, %c1, %26], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%29 = vector.transfer_read %1[%arg1, %arg6, %c2, %26], %cst_0 {in_bounds = [true]} : memref<3x3x3x32xf32>, vector<4xf32>
%30 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%31 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%32 = vector.transfer_read %0[%c0, %30, %31, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%33 = vector.extract_strided_slice %32 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%34 = vector.extract %33[0] : vector<1xf32>
%35 = splat %34 : vector<4xf32>
%36 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%37 = vector.fma %35, %27, %36 : vector<4xf32>
%38 = vector.extract_strided_slice %32 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%39 = vector.extract %38[0] : vector<1xf32>
%40 = splat %39 : vector<4xf32>
%41 = vector.fma %40, %28, %37 : vector<4xf32>
%42 = vector.extract_strided_slice %32 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%43 = vector.extract %42[0] : vector<1xf32>
%44 = splat %43 : vector<4xf32>
%45 = vector.fma %44, %29, %41 : vector<4xf32>
%46 = vector.shape_cast %45 : vector<4xf32> to vector<1x4xf32>
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%48 = vector.transfer_read %0[%c0, %30, %47, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%49 = vector.extract_strided_slice %48 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%50 = vector.extract %49[0] : vector<1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%53 = vector.fma %51, %27, %52 : vector<4xf32>
%54 = vector.extract_strided_slice %48 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%55 = vector.extract %54[0] : vector<1xf32>
%56 = splat %55 : vector<4xf32>
%57 = vector.fma %56, %28, %53 : vector<4xf32>
%58 = vector.extract_strided_slice %48 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%59 = vector.extract %58[0] : vector<1xf32>
%60 = splat %59 : vector<4xf32>
%61 = vector.fma %60, %29, %57 : vector<4xf32>
%62 = vector.shape_cast %61 : vector<4xf32> to vector<1x4xf32>
%63 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%64 = vector.transfer_read %0[%c0, %30, %63, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%65 = vector.extract_strided_slice %64 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%66 = vector.extract %65[0] : vector<1xf32>
%67 = splat %66 : vector<4xf32>
%68 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%69 = vector.fma %67, %27, %68 : vector<4xf32>
%70 = vector.extract_strided_slice %64 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract %70[0] : vector<1xf32>
%72 = splat %71 : vector<4xf32>
%73 = vector.fma %72, %28, %69 : vector<4xf32>
%74 = vector.extract_strided_slice %64 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%75 = vector.extract %74[0] : vector<1xf32>
%76 = splat %75 : vector<4xf32>
%77 = vector.fma %76, %29, %73 : vector<4xf32>
%78 = vector.shape_cast %77 : vector<4xf32> to vector<1x4xf32>
%79 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%80 = vector.transfer_read %0[%c0, %30, %79, %c0], %cst_0 {in_bounds = [true]} : memref<1x225x225x3xf32>, vector<3xf32>
%81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%82 = vector.extract %81[0] : vector<1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%85 = vector.fma %83, %27, %84 : vector<4xf32>
%86 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%87 = vector.extract %86[0] : vector<1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %28, %85 : vector<4xf32>
%90 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%91 = vector.extract %90[0] : vector<1xf32>
%92 = splat %91 : vector<4xf32>
%93 = vector.fma %92, %29, %89 : vector<4xf32>
%94 = vector.shape_cast %93 : vector<4xf32> to vector<1x4xf32>
scf.yield %46, %62, %78, %94 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %25#0, %25#1, %25#2, %25#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
vector.transfer_write %15, %2[%c0, %16, %17, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%19 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
vector.transfer_write %19, %2[%c0, %16, %20, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%21 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
vector.transfer_write %21, %2[%c0, %16, %22, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
%23 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
vector.transfer_write %23, %2[%c0, %16, %24, %18] {in_bounds = [true]} : vector<4xf32>, memref<1x112x112x32xf32>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVVectorizeLoadStore //----- //
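// SPIRVVectorizeLoadStore retypes the 4-element-aligned bindings to carry
// vector<4xf32> elements: the filter becomes memref<3x3x3x8xvector<4xf32>>
// and the output memref<1x112x112x8xvector<4xf32>>, so each filter
// transfer_read turns into a single memref.load with the channel index
// divided by 4. The input has only 3 channels and stays scalar; every
// vector<3xf32> read is expanded into three memref.load ops stitched together
// with vector.insert.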
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%cst_0 = constant dense<0.000000e+00> : vector<3xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%29:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%31 = divi_signed %30, %c4 : index
%32 = memref.load %1[%arg1, %arg6, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
%33 = divi_signed %30, %c4 : index
%34 = memref.load %1[%arg1, %arg6, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
%35 = divi_signed %30, %c4 : index
%36 = memref.load %1[%arg1, %arg6, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
%37 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%39 = memref.load %0[%c0, %37, %38, %c0] : memref<1x225x225x3xf32>
%40 = vector.insert %39, %cst_0 [0] : f32 into vector<3xf32>
%41 = memref.load %0[%c0, %37, %38, %c1] : memref<1x225x225x3xf32>
%42 = vector.insert %41, %40 [1] : f32 into vector<3xf32>
%43 = memref.load %0[%c0, %37, %38, %c2] : memref<1x225x225x3xf32>
%44 = vector.insert %43, %42 [2] : f32 into vector<3xf32>
%45 = vector.extract_strided_slice %44 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%46 = vector.extract %45[0] : vector<1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%49 = vector.fma %47, %32, %48 : vector<4xf32>
%50 = vector.extract_strided_slice %44 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%51 = vector.extract %50[0] : vector<1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %34, %49 : vector<4xf32>
%54 = vector.extract_strided_slice %44 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%55 = vector.extract %54[0] : vector<1xf32>
%56 = splat %55 : vector<4xf32>
%57 = vector.fma %56, %36, %53 : vector<4xf32>
%58 = vector.shape_cast %57 : vector<4xf32> to vector<1x4xf32>
%59 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%60 = memref.load %0[%c0, %37, %59, %c0] : memref<1x225x225x3xf32>
%61 = vector.insert %60, %cst_0 [0] : f32 into vector<3xf32>
%62 = memref.load %0[%c0, %37, %59, %c1] : memref<1x225x225x3xf32>
%63 = vector.insert %62, %61 [1] : f32 into vector<3xf32>
%64 = memref.load %0[%c0, %37, %59, %c2] : memref<1x225x225x3xf32>
%65 = vector.insert %64, %63 [2] : f32 into vector<3xf32>
%66 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%67 = vector.extract %66[0] : vector<1xf32>
%68 = splat %67 : vector<4xf32>
%69 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%70 = vector.fma %68, %32, %69 : vector<4xf32>
%71 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%72 = vector.extract %71[0] : vector<1xf32>
%73 = splat %72 : vector<4xf32>
%74 = vector.fma %73, %34, %70 : vector<4xf32>
%75 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.fma %77, %36, %74 : vector<4xf32>
%79 = vector.shape_cast %78 : vector<4xf32> to vector<1x4xf32>
%80 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%81 = memref.load %0[%c0, %37, %80, %c0] : memref<1x225x225x3xf32>
%82 = vector.insert %81, %cst_0 [0] : f32 into vector<3xf32>
%83 = memref.load %0[%c0, %37, %80, %c1] : memref<1x225x225x3xf32>
%84 = vector.insert %83, %82 [1] : f32 into vector<3xf32>
%85 = memref.load %0[%c0, %37, %80, %c2] : memref<1x225x225x3xf32>
%86 = vector.insert %85, %84 [2] : f32 into vector<3xf32>
%87 = vector.extract_strided_slice %86 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%88 = vector.extract %87[0] : vector<1xf32>
%89 = splat %88 : vector<4xf32>
%90 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%91 = vector.fma %89, %32, %90 : vector<4xf32>
%92 = vector.extract_strided_slice %86 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%93 = vector.extract %92[0] : vector<1xf32>
%94 = splat %93 : vector<4xf32>
%95 = vector.fma %94, %34, %91 : vector<4xf32>
%96 = vector.extract_strided_slice %86 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%97 = vector.extract %96[0] : vector<1xf32>
%98 = splat %97 : vector<4xf32>
%99 = vector.fma %98, %36, %95 : vector<4xf32>
%100 = vector.shape_cast %99 : vector<4xf32> to vector<1x4xf32>
%101 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%102 = memref.load %0[%c0, %37, %101, %c0] : memref<1x225x225x3xf32>
%103 = vector.insert %102, %cst_0 [0] : f32 into vector<3xf32>
%104 = memref.load %0[%c0, %37, %101, %c1] : memref<1x225x225x3xf32>
%105 = vector.insert %104, %103 [1] : f32 into vector<3xf32>
%106 = memref.load %0[%c0, %37, %101, %c2] : memref<1x225x225x3xf32>
%107 = vector.insert %106, %105 [2] : f32 into vector<3xf32>
%108 = vector.extract_strided_slice %107 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%109 = vector.extract %108[0] : vector<1xf32>
%110 = splat %109 : vector<4xf32>
%111 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%112 = vector.fma %110, %32, %111 : vector<4xf32>
%113 = vector.extract_strided_slice %107 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%114 = vector.extract %113[0] : vector<1xf32>
%115 = splat %114 : vector<4xf32>
%116 = vector.fma %115, %34, %112 : vector<4xf32>
%117 = vector.extract_strided_slice %107 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %36, %116 : vector<4xf32>
%121 = vector.shape_cast %120 : vector<4xf32> to vector<1x4xf32>
scf.yield %58, %79, %100, %121 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %29#0, %29#1, %29#2, %29#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%19 = divi_signed %18, %c4 : index
memref.store %15, %2[%c0, %16, %17, %19] : memref<1x112x112x8xvector<4xf32>>
%20 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
%22 = divi_signed %18, %c4 : index
memref.store %20, %2[%c0, %16, %21, %22] : memref<1x112x112x8xvector<4xf32>>
%23 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
%25 = divi_signed %18, %c4 : index
memref.store %23, %2[%c0, %16, %24, %25] : memref<1x112x112x8xvector<4xf32>>
%26 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
%27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
%28 = divi_signed %18, %c4 : index
memref.store %26, %2[%c0, %16, %27, %28] : memref<1x112x112x8xvector<4xf32>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVVectorToCooperativeMatrix //----- //
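// This dispatch contains no matmul-shaped contraction to map onto cooperative
// matrix ops, so the pass makes no changes; the dump is printed at function
// scope, which is why the module wrapper and hal.interface are omitted.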
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%cst_0 = constant dense<0.000000e+00> : vector<3xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%29:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%31 = divi_signed %30, %c4 : index
%32 = memref.load %1[%arg1, %arg6, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
%33 = divi_signed %30, %c4 : index
%34 = memref.load %1[%arg1, %arg6, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
%35 = divi_signed %30, %c4 : index
%36 = memref.load %1[%arg1, %arg6, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
%37 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%39 = memref.load %0[%c0, %37, %38, %c0] : memref<1x225x225x3xf32>
%40 = vector.insert %39, %cst_0 [0] : f32 into vector<3xf32>
%41 = memref.load %0[%c0, %37, %38, %c1] : memref<1x225x225x3xf32>
%42 = vector.insert %41, %40 [1] : f32 into vector<3xf32>
%43 = memref.load %0[%c0, %37, %38, %c2] : memref<1x225x225x3xf32>
%44 = vector.insert %43, %42 [2] : f32 into vector<3xf32>
%45 = vector.extract_strided_slice %44 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%46 = vector.extract %45[0] : vector<1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%49 = vector.fma %47, %32, %48 : vector<4xf32>
%50 = vector.extract_strided_slice %44 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%51 = vector.extract %50[0] : vector<1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %34, %49 : vector<4xf32>
%54 = vector.extract_strided_slice %44 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%55 = vector.extract %54[0] : vector<1xf32>
%56 = splat %55 : vector<4xf32>
%57 = vector.fma %56, %36, %53 : vector<4xf32>
%58 = vector.shape_cast %57 : vector<4xf32> to vector<1x4xf32>
%59 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%60 = memref.load %0[%c0, %37, %59, %c0] : memref<1x225x225x3xf32>
%61 = vector.insert %60, %cst_0 [0] : f32 into vector<3xf32>
%62 = memref.load %0[%c0, %37, %59, %c1] : memref<1x225x225x3xf32>
%63 = vector.insert %62, %61 [1] : f32 into vector<3xf32>
%64 = memref.load %0[%c0, %37, %59, %c2] : memref<1x225x225x3xf32>
%65 = vector.insert %64, %63 [2] : f32 into vector<3xf32>
%66 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%67 = vector.extract %66[0] : vector<1xf32>
%68 = splat %67 : vector<4xf32>
%69 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%70 = vector.fma %68, %32, %69 : vector<4xf32>
%71 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%72 = vector.extract %71[0] : vector<1xf32>
%73 = splat %72 : vector<4xf32>
%74 = vector.fma %73, %34, %70 : vector<4xf32>
%75 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%76 = vector.extract %75[0] : vector<1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.fma %77, %36, %74 : vector<4xf32>
%79 = vector.shape_cast %78 : vector<4xf32> to vector<1x4xf32>
%80 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%81 = memref.load %0[%c0, %37, %80, %c0] : memref<1x225x225x3xf32>
%82 = vector.insert %81, %cst_0 [0] : f32 into vector<3xf32>
%83 = memref.load %0[%c0, %37, %80, %c1] : memref<1x225x225x3xf32>
%84 = vector.insert %83, %82 [1] : f32 into vector<3xf32>
%85 = memref.load %0[%c0, %37, %80, %c2] : memref<1x225x225x3xf32>
%86 = vector.insert %85, %84 [2] : f32 into vector<3xf32>
%87 = vector.extract_strided_slice %86 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%88 = vector.extract %87[0] : vector<1xf32>
%89 = splat %88 : vector<4xf32>
%90 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%91 = vector.fma %89, %32, %90 : vector<4xf32>
%92 = vector.extract_strided_slice %86 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%93 = vector.extract %92[0] : vector<1xf32>
%94 = splat %93 : vector<4xf32>
%95 = vector.fma %94, %34, %91 : vector<4xf32>
%96 = vector.extract_strided_slice %86 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%97 = vector.extract %96[0] : vector<1xf32>
%98 = splat %97 : vector<4xf32>
%99 = vector.fma %98, %36, %95 : vector<4xf32>
%100 = vector.shape_cast %99 : vector<4xf32> to vector<1x4xf32>
%101 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%102 = memref.load %0[%c0, %37, %101, %c0] : memref<1x225x225x3xf32>
%103 = vector.insert %102, %cst_0 [0] : f32 into vector<3xf32>
%104 = memref.load %0[%c0, %37, %101, %c1] : memref<1x225x225x3xf32>
%105 = vector.insert %104, %103 [1] : f32 into vector<3xf32>
%106 = memref.load %0[%c0, %37, %101, %c2] : memref<1x225x225x3xf32>
%107 = vector.insert %106, %105 [2] : f32 into vector<3xf32>
%108 = vector.extract_strided_slice %107 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%109 = vector.extract %108[0] : vector<1xf32>
%110 = splat %109 : vector<4xf32>
%111 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%112 = vector.fma %110, %32, %111 : vector<4xf32>
%113 = vector.extract_strided_slice %107 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%114 = vector.extract %113[0] : vector<1xf32>
%115 = splat %114 : vector<4xf32>
%116 = vector.fma %115, %34, %112 : vector<4xf32>
%117 = vector.extract_strided_slice %107 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %36, %116 : vector<4xf32>
%121 = vector.shape_cast %120 : vector<4xf32> to vector<1x4xf32>
scf.yield %58, %79, %100, %121 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %29#0, %29#1, %29#2, %29#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
%15 = vector.shape_cast %14#3 : vector<1x4xf32> to vector<4xf32>
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%18 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%19 = divi_signed %18, %c4 : index
memref.store %15, %2[%c0, %16, %17, %19] : memref<1x112x112x8xvector<4xf32>>
%20 = vector.shape_cast %14#2 : vector<1x4xf32> to vector<4xf32>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
%22 = divi_signed %18, %c4 : index
memref.store %20, %2[%c0, %16, %21, %22] : memref<1x112x112x8xvector<4xf32>>
%23 = vector.shape_cast %14#1 : vector<1x4xf32> to vector<4xf32>
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
%25 = divi_signed %18, %c4 : index
memref.store %23, %2[%c0, %16, %24, %25] : memref<1x112x112x8xvector<4xf32>>
%26 = vector.shape_cast %14#0 : vector<1x4xf32> to vector<4xf32>
%27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
%28 = divi_signed %18, %c4 : index
memref.store %26, %2[%c0, %16, %27, %28] : memref<1x112x112x8xvector<4xf32>>
}
return
}
// -----// IR Dump After ForOpCanonicalization //----- //
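// ForOpCanonicalization propagates the vector.shape_cast ops across the
// scf.for boundaries: the iter_args are now carried as vector<4xf32> rather
// than vector<1x4xf32>, so the casts around each vector.fma chain and before
// the final stores disappear. The four remaining casts of %cst feeding the
// outer loop are left for the canonicalizer below to fold into a
// vector<4xf32> constant.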
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%cst_0 = constant dense<0.000000e+00> : vector<3xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%18:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %14, %arg3 = %15, %arg4 = %16, %arg5 = %17) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%29:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%31 = divi_signed %30, %c4 : index
%32 = memref.load %1[%arg1, %arg6, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
%33 = divi_signed %30, %c4 : index
%34 = memref.load %1[%arg1, %arg6, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
%35 = divi_signed %30, %c4 : index
%36 = memref.load %1[%arg1, %arg6, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
%37 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%38 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%39 = memref.load %0[%c0, %37, %38, %c0] : memref<1x225x225x3xf32>
%40 = vector.insert %39, %cst_0 [0] : f32 into vector<3xf32>
%41 = memref.load %0[%c0, %37, %38, %c1] : memref<1x225x225x3xf32>
%42 = vector.insert %41, %40 [1] : f32 into vector<3xf32>
%43 = memref.load %0[%c0, %37, %38, %c2] : memref<1x225x225x3xf32>
%44 = vector.insert %43, %42 [2] : f32 into vector<3xf32>
%45 = vector.extract_strided_slice %44 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%46 = vector.extract %45[0] : vector<1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %32, %arg7 : vector<4xf32>
%49 = vector.extract_strided_slice %44 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%50 = vector.extract %49[0] : vector<1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.fma %51, %34, %48 : vector<4xf32>
%53 = vector.extract_strided_slice %44 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%54 = vector.extract %53[0] : vector<1xf32>
%55 = splat %54 : vector<4xf32>
%56 = vector.fma %55, %36, %52 : vector<4xf32>
%57 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%58 = memref.load %0[%c0, %37, %57, %c0] : memref<1x225x225x3xf32>
%59 = vector.insert %58, %cst_0 [0] : f32 into vector<3xf32>
%60 = memref.load %0[%c0, %37, %57, %c1] : memref<1x225x225x3xf32>
%61 = vector.insert %60, %59 [1] : f32 into vector<3xf32>
%62 = memref.load %0[%c0, %37, %57, %c2] : memref<1x225x225x3xf32>
%63 = vector.insert %62, %61 [2] : f32 into vector<3xf32>
%64 = vector.extract_strided_slice %63 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %32, %arg8 : vector<4xf32>
%68 = vector.extract_strided_slice %63 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract %68[0] : vector<1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %34, %67 : vector<4xf32>
%72 = vector.extract_strided_slice %63 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%73 = vector.extract %72[0] : vector<1xf32>
%74 = splat %73 : vector<4xf32>
%75 = vector.fma %74, %36, %71 : vector<4xf32>
%76 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%77 = memref.load %0[%c0, %37, %76, %c0] : memref<1x225x225x3xf32>
%78 = vector.insert %77, %cst_0 [0] : f32 into vector<3xf32>
%79 = memref.load %0[%c0, %37, %76, %c1] : memref<1x225x225x3xf32>
%80 = vector.insert %79, %78 [1] : f32 into vector<3xf32>
%81 = memref.load %0[%c0, %37, %76, %c2] : memref<1x225x225x3xf32>
%82 = vector.insert %81, %80 [2] : f32 into vector<3xf32>
%83 = vector.extract_strided_slice %82 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%84 = vector.extract %83[0] : vector<1xf32>
%85 = splat %84 : vector<4xf32>
%86 = vector.fma %85, %32, %arg9 : vector<4xf32>
%87 = vector.extract_strided_slice %82 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%88 = vector.extract %87[0] : vector<1xf32>
%89 = splat %88 : vector<4xf32>
%90 = vector.fma %89, %34, %86 : vector<4xf32>
%91 = vector.extract_strided_slice %82 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%92 = vector.extract %91[0] : vector<1xf32>
%93 = splat %92 : vector<4xf32>
%94 = vector.fma %93, %36, %90 : vector<4xf32>
%95 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%96 = memref.load %0[%c0, %37, %95, %c0] : memref<1x225x225x3xf32>
%97 = vector.insert %96, %cst_0 [0] : f32 into vector<3xf32>
%98 = memref.load %0[%c0, %37, %95, %c1] : memref<1x225x225x3xf32>
%99 = vector.insert %98, %97 [1] : f32 into vector<3xf32>
%100 = memref.load %0[%c0, %37, %95, %c2] : memref<1x225x225x3xf32>
%101 = vector.insert %100, %99 [2] : f32 into vector<3xf32>
%102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%103 = vector.extract %102[0] : vector<1xf32>
%104 = splat %103 : vector<4xf32>
%105 = vector.fma %104, %32, %arg10 : vector<4xf32>
%106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%107 = vector.extract %106[0] : vector<1xf32>
%108 = splat %107 : vector<4xf32>
%109 = vector.fma %108, %34, %105 : vector<4xf32>
%110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%111 = vector.extract %110[0] : vector<1xf32>
%112 = splat %111 : vector<4xf32>
%113 = vector.fma %112, %36, %109 : vector<4xf32>
scf.yield %56, %75, %94, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %29#0, %29#1, %29#2, %29#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%22 = divi_signed %21, %c4 : index
memref.store %18#3, %2[%c0, %19, %20, %22] : memref<1x112x112x8xvector<4xf32>>
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
%24 = divi_signed %21, %c4 : index
memref.store %18#2, %2[%c0, %19, %23, %24] : memref<1x112x112x8xvector<4xf32>>
%25 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
%26 = divi_signed %21, %c4 : index
memref.store %18#1, %2[%c0, %19, %25, %26] : memref<1x112x112x8xvector<4xf32>>
%27 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
%28 = divi_signed %21, %c4 : index
memref.store %18#0, %2[%c0, %19, %27, %28] : memref<1x112x112x8xvector<4xf32>>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
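// The canonicalizer folds the shape casts of the zero constant into a single
// dense<0.000000e+00> : vector<4xf32> constant, drops the now-unused
// vector<1x4xf32> constant, and re-sorts the remaining constants.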
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = constant dense<0.000000e+00> : vector<3xf32>
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%25:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%26 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%27 = divi_signed %26, %c4 : index
%28 = memref.load %1[%arg1, %arg6, %c0, %27] : memref<3x3x3x8xvector<4xf32>>
%29 = divi_signed %26, %c4 : index
%30 = memref.load %1[%arg1, %arg6, %c1, %29] : memref<3x3x3x8xvector<4xf32>>
%31 = divi_signed %26, %c4 : index
%32 = memref.load %1[%arg1, %arg6, %c2, %31] : memref<3x3x3x8xvector<4xf32>>
%33 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%34 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%35 = memref.load %0[%c0, %33, %34, %c0] : memref<1x225x225x3xf32>
%36 = vector.insert %35, %cst_0 [0] : f32 into vector<3xf32>
%37 = memref.load %0[%c0, %33, %34, %c1] : memref<1x225x225x3xf32>
%38 = vector.insert %37, %36 [1] : f32 into vector<3xf32>
%39 = memref.load %0[%c0, %33, %34, %c2] : memref<1x225x225x3xf32>
%40 = vector.insert %39, %38 [2] : f32 into vector<3xf32>
%41 = vector.extract_strided_slice %40 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%42 = vector.extract %41[0] : vector<1xf32>
%43 = splat %42 : vector<4xf32>
%44 = vector.fma %43, %28, %arg7 : vector<4xf32>
%45 = vector.extract_strided_slice %40 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%46 = vector.extract %45[0] : vector<1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %30, %44 : vector<4xf32>
%49 = vector.extract_strided_slice %40 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%50 = vector.extract %49[0] : vector<1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.fma %51, %32, %48 : vector<4xf32>
%53 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%54 = memref.load %0[%c0, %33, %53, %c0] : memref<1x225x225x3xf32>
%55 = vector.insert %54, %cst_0 [0] : f32 into vector<3xf32>
%56 = memref.load %0[%c0, %33, %53, %c1] : memref<1x225x225x3xf32>
%57 = vector.insert %56, %55 [1] : f32 into vector<3xf32>
%58 = memref.load %0[%c0, %33, %53, %c2] : memref<1x225x225x3xf32>
%59 = vector.insert %58, %57 [2] : f32 into vector<3xf32>
%60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%61 = vector.extract %60[0] : vector<1xf32>
%62 = splat %61 : vector<4xf32>
%63 = vector.fma %62, %28, %arg8 : vector<4xf32>
%64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %30, %63 : vector<4xf32>
%68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract %68[0] : vector<1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %32, %67 : vector<4xf32>
%72 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%73 = memref.load %0[%c0, %33, %72, %c0] : memref<1x225x225x3xf32>
%74 = vector.insert %73, %cst_0 [0] : f32 into vector<3xf32>
%75 = memref.load %0[%c0, %33, %72, %c1] : memref<1x225x225x3xf32>
%76 = vector.insert %75, %74 [1] : f32 into vector<3xf32>
%77 = memref.load %0[%c0, %33, %72, %c2] : memref<1x225x225x3xf32>
%78 = vector.insert %77, %76 [2] : f32 into vector<3xf32>
%79 = vector.extract_strided_slice %78 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%80 = vector.extract %79[0] : vector<1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.fma %81, %28, %arg9 : vector<4xf32>
%83 = vector.extract_strided_slice %78 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%84 = vector.extract %83[0] : vector<1xf32>
%85 = splat %84 : vector<4xf32>
%86 = vector.fma %85, %30, %82 : vector<4xf32>
%87 = vector.extract_strided_slice %78 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%88 = vector.extract %87[0] : vector<1xf32>
%89 = splat %88 : vector<4xf32>
%90 = vector.fma %89, %32, %86 : vector<4xf32>
%91 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%92 = memref.load %0[%c0, %33, %91, %c0] : memref<1x225x225x3xf32>
%93 = vector.insert %92, %cst_0 [0] : f32 into vector<3xf32>
%94 = memref.load %0[%c0, %33, %91, %c1] : memref<1x225x225x3xf32>
%95 = vector.insert %94, %93 [1] : f32 into vector<3xf32>
%96 = memref.load %0[%c0, %33, %91, %c2] : memref<1x225x225x3xf32>
%97 = vector.insert %96, %95 [2] : f32 into vector<3xf32>
%98 = vector.extract_strided_slice %97 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%99 = vector.extract %98[0] : vector<1xf32>
%100 = splat %99 : vector<4xf32>
%101 = vector.fma %100, %28, %arg10 : vector<4xf32>
%102 = vector.extract_strided_slice %97 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%103 = vector.extract %102[0] : vector<1xf32>
%104 = splat %103 : vector<4xf32>
%105 = vector.fma %104, %30, %101 : vector<4xf32>
%106 = vector.extract_strided_slice %97 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%107 = vector.extract %106[0] : vector<1xf32>
%108 = splat %107 : vector<4xf32>
%109 = vector.fma %108, %32, %105 : vector<4xf32>
scf.yield %52, %71, %90, %109 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %25#0, %25#1, %25#2, %25#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%18 = divi_signed %17, %c4 : index
memref.store %14#3, %2[%c0, %15, %16, %18] : memref<1x112x112x8xvector<4xf32>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
%20 = divi_signed %17, %c4 : index
memref.store %14#2, %2[%c0, %15, %19, %20] : memref<1x112x112x8xvector<4xf32>>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
%22 = divi_signed %17, %c4 : index
memref.store %14#1, %2[%c0, %15, %21, %22] : memref<1x112x112x8xvector<4xf32>>
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
%24 = divi_signed %17, %c4 : index
memref.store %14#0, %2[%c0, %15, %23, %24] : memref<1x112x112x8xvector<4xf32>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
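// CSE merges the duplicated index arithmetic: inside the loop the three
// identical divi_signed values feeding the filter loads collapse into %24,
// and at the stores the four copies of divi_signed %17, %c4 collapse into
// %18, now shared by all four memref.store ops.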
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = constant dense<0.000000e+00> : vector<3xf32>
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%22:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%24 = divi_signed %23, %c4 : index
%25 = memref.load %1[%arg1, %arg6, %c0, %24] : memref<3x3x3x8xvector<4xf32>>
%26 = memref.load %1[%arg1, %arg6, %c1, %24] : memref<3x3x3x8xvector<4xf32>>
%27 = memref.load %1[%arg1, %arg6, %c2, %24] : memref<3x3x3x8xvector<4xf32>>
%28 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg1)[%13, %11]
%29 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%arg6)[%5, %12]
%30 = memref.load %0[%c0, %28, %29, %c0] : memref<1x225x225x3xf32>
%31 = vector.insert %30, %cst_0 [0] : f32 into vector<3xf32>
%32 = memref.load %0[%c0, %28, %29, %c1] : memref<1x225x225x3xf32>
%33 = vector.insert %32, %31 [1] : f32 into vector<3xf32>
%34 = memref.load %0[%c0, %28, %29, %c2] : memref<1x225x225x3xf32>
%35 = vector.insert %34, %33 [2] : f32 into vector<3xf32>
%36 = vector.extract_strided_slice %35 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%37 = vector.extract %36[0] : vector<1xf32>
%38 = splat %37 : vector<4xf32>
%39 = vector.fma %38, %25, %arg7 : vector<4xf32>
%40 = vector.extract_strided_slice %35 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%41 = vector.extract %40[0] : vector<1xf32>
%42 = splat %41 : vector<4xf32>
%43 = vector.fma %42, %26, %39 : vector<4xf32>
%44 = vector.extract_strided_slice %35 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%45 = vector.extract %44[0] : vector<1xf32>
%46 = splat %45 : vector<4xf32>
%47 = vector.fma %46, %27, %43 : vector<4xf32>
%48 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 2)>()[%5, %12, %arg6]
%49 = memref.load %0[%c0, %28, %48, %c0] : memref<1x225x225x3xf32>
%50 = vector.insert %49, %cst_0 [0] : f32 into vector<3xf32>
%51 = memref.load %0[%c0, %28, %48, %c1] : memref<1x225x225x3xf32>
%52 = vector.insert %51, %50 [1] : f32 into vector<3xf32>
%53 = memref.load %0[%c0, %28, %48, %c2] : memref<1x225x225x3xf32>
%54 = vector.insert %53, %52 [2] : f32 into vector<3xf32>
%55 = vector.extract_strided_slice %54 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%56 = vector.extract %55[0] : vector<1xf32>
%57 = splat %56 : vector<4xf32>
%58 = vector.fma %57, %25, %arg8 : vector<4xf32>
%59 = vector.extract_strided_slice %54 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%60 = vector.extract %59[0] : vector<1xf32>
%61 = splat %60 : vector<4xf32>
%62 = vector.fma %61, %26, %58 : vector<4xf32>
%63 = vector.extract_strided_slice %54 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%64 = vector.extract %63[0] : vector<1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %27, %62 : vector<4xf32>
%67 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 4)>()[%5, %12, %arg6]
%68 = memref.load %0[%c0, %28, %67, %c0] : memref<1x225x225x3xf32>
%69 = vector.insert %68, %cst_0 [0] : f32 into vector<3xf32>
%70 = memref.load %0[%c0, %28, %67, %c1] : memref<1x225x225x3xf32>
%71 = vector.insert %70, %69 [1] : f32 into vector<3xf32>
%72 = memref.load %0[%c0, %28, %67, %c2] : memref<1x225x225x3xf32>
%73 = vector.insert %72, %71 [2] : f32 into vector<3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%75 = vector.extract %74[0] : vector<1xf32>
%76 = splat %75 : vector<4xf32>
%77 = vector.fma %76, %25, %arg9 : vector<4xf32>
%78 = vector.extract_strided_slice %73 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%79 = vector.extract %78[0] : vector<1xf32>
%80 = splat %79 : vector<4xf32>
%81 = vector.fma %80, %26, %77 : vector<4xf32>
%82 = vector.extract_strided_slice %73 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%83 = vector.extract %82[0] : vector<1xf32>
%84 = splat %83 : vector<4xf32>
%85 = vector.fma %84, %27, %81 : vector<4xf32>
%86 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 + s2 + 6)>()[%5, %12, %arg6]
%87 = memref.load %0[%c0, %28, %86, %c0] : memref<1x225x225x3xf32>
%88 = vector.insert %87, %cst_0 [0] : f32 into vector<3xf32>
%89 = memref.load %0[%c0, %28, %86, %c1] : memref<1x225x225x3xf32>
%90 = vector.insert %89, %88 [1] : f32 into vector<3xf32>
%91 = memref.load %0[%c0, %28, %86, %c2] : memref<1x225x225x3xf32>
%92 = vector.insert %91, %90 [2] : f32 into vector<3xf32>
%93 = vector.extract_strided_slice %92 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%94 = vector.extract %93[0] : vector<1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.fma %95, %25, %arg10 : vector<4xf32>
%97 = vector.extract_strided_slice %92 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%98 = vector.extract %97[0] : vector<1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.fma %99, %26, %96 : vector<4xf32>
%101 = vector.extract_strided_slice %92 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%102 = vector.extract %101[0] : vector<1xf32>
%103 = splat %102 : vector<4xf32>
%104 = vector.fma %103, %27, %100 : vector<4xf32>
scf.yield %47, %66, %85, %104 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %22#0, %22#1, %22#2, %22#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg0, %8]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 3)>()[%3, %9]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%18 = divi_signed %17, %c4 : index
memref.store %14#3, %2[%c0, %15, %16, %18] : memref<1x112x112x8xvector<4xf32>>
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 2)>()[%3, %9]
memref.store %14#2, %2[%c0, %15, %19, %18] : memref<1x112x112x8xvector<4xf32>>
%20 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 1)>()[%3, %9]
memref.store %14#1, %2[%c0, %15, %20, %18] : memref<1x112x112x8xvector<4xf32>>
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%3, %9]
memref.store %14#0, %2[%c0, %15, %21, %18] : memref<1x112x112x8xvector<4xf32>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After FlattenMemRefSubspan //----- //
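// FlattenMemRefSubspan linearizes each binding to a 1-D memref with an
// explicit element count: memref<?xf32>{%c151875} for the 1x225x225x3 input
// (225 * 225 * 3 = 151875), memref<?xvector<4xf32>>{%c216} for the 3x3x3x8
// filter, and memref<?xvector<4xf32>>{%c100352} for the 1x112x112x8 output.
// The row-major strides move into the affine index maps, e.g. the filter
// access becomes d0 * 72 + d1 * 24 + d2, with +8/+16 offsets standing in for
// the second and third input channel.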
module {
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = constant dense<0.000000e+00> : vector<3xf32>
%c8 = constant 8 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c16 = constant 16 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%c151875 = constant 151875 : index
%c216 = constant 216 : index
%c100352 = constant 100352 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst, %arg3 = %cst, %arg4 = %cst, %arg5 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%21:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%23 = divi_signed %22, %c4 : index
%24 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2)>(%arg1, %arg6, %23)
%25 = memref.load %1[%24] : memref<?xvector<4xf32>>
%26 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 8)>(%arg1, %arg6, %23)
%27 = memref.load %1[%26] : memref<?xvector<4xf32>>
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 16)>(%arg1, %arg6, %23)
%29 = memref.load %1[%28] : memref<?xvector<4xf32>>
%30 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3)>(%arg1, %arg6)[%13, %11, %5, %12]
%31 = memref.load %0[%30] : memref<?xf32>
%32 = vector.insert %31, %cst_0 [0] : f32 into vector<3xf32>
%33 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 1)>(%arg1, %arg6)[%13, %11, %5, %12]
%34 = memref.load %0[%33] : memref<?xf32>
%35 = vector.insert %34, %32 [1] : f32 into vector<3xf32>
%36 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 2)>(%arg1, %arg6)[%13, %11, %5, %12]
%37 = memref.load %0[%36] : memref<?xf32>
%38 = vector.insert %37, %35 [2] : f32 into vector<3xf32>
%39 = vector.extract_strided_slice %38 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%40 = vector.extract %39[0] : vector<1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.fma %41, %25, %arg7 : vector<4xf32>
%43 = vector.extract_strided_slice %38 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%44 = vector.extract %43[0] : vector<1xf32>
%45 = splat %44 : vector<4xf32>
%46 = vector.fma %45, %27, %42 : vector<4xf32>
%47 = vector.extract_strided_slice %38 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%48 = vector.extract %47[0] : vector<1xf32>
%49 = splat %48 : vector<4xf32>
%50 = vector.fma %49, %29, %46 : vector<4xf32>
%51 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 6)>(%arg1)[%13, %11, %5, %12, %arg6]
%52 = memref.load %0[%51] : memref<?xf32>
%53 = vector.insert %52, %cst_0 [0] : f32 into vector<3xf32>
%54 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 7)>(%arg1)[%13, %11, %5, %12, %arg6]
%55 = memref.load %0[%54] : memref<?xf32>
%56 = vector.insert %55, %53 [1] : f32 into vector<3xf32>
%57 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 8)>(%arg1)[%13, %11, %5, %12, %arg6]
%58 = memref.load %0[%57] : memref<?xf32>
%59 = vector.insert %58, %56 [2] : f32 into vector<3xf32>
%60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%61 = vector.extract %60[0] : vector<1xf32>
%62 = splat %61 : vector<4xf32>
%63 = vector.fma %62, %25, %arg8 : vector<4xf32>
%64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %27, %63 : vector<4xf32>
%68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract %68[0] : vector<1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %29, %67 : vector<4xf32>
%72 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 12)>(%arg1)[%13, %11, %5, %12, %arg6]
%73 = memref.load %0[%72] : memref<?xf32>
%74 = vector.insert %73, %cst_0 [0] : f32 into vector<3xf32>
%75 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 13)>(%arg1)[%13, %11, %5, %12, %arg6]
%76 = memref.load %0[%75] : memref<?xf32>
%77 = vector.insert %76, %74 [1] : f32 into vector<3xf32>
%78 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 14)>(%arg1)[%13, %11, %5, %12, %arg6]
%79 = memref.load %0[%78] : memref<?xf32>
%80 = vector.insert %79, %77 [2] : f32 into vector<3xf32>
%81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%82 = vector.extract %81[0] : vector<1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %25, %arg9 : vector<4xf32>
%85 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%86 = vector.extract %85[0] : vector<1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.fma %87, %27, %84 : vector<4xf32>
%89 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%90 = vector.extract %89[0] : vector<1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %29, %88 : vector<4xf32>
%93 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 18)>(%arg1)[%13, %11, %5, %12, %arg6]
%94 = memref.load %0[%93] : memref<?xf32>
%95 = vector.insert %94, %cst_0 [0] : f32 into vector<3xf32>
%96 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 19)>(%arg1)[%13, %11, %5, %12, %arg6]
%97 = memref.load %0[%96] : memref<?xf32>
%98 = vector.insert %97, %95 [1] : f32 into vector<3xf32>
%99 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 20)>(%arg1)[%13, %11, %5, %12, %arg6]
%100 = memref.load %0[%99] : memref<?xf32>
%101 = vector.insert %100, %98 [2] : f32 into vector<3xf32>
%102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%103 = vector.extract %102[0] : vector<1xf32>
%104 = splat %103 : vector<4xf32>
%105 = vector.fma %104, %25, %arg10 : vector<4xf32>
%106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%107 = vector.extract %106[0] : vector<1xf32>
%108 = splat %107 : vector<4xf32>
%109 = vector.fma %108, %27, %105 : vector<4xf32>
%110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%111 = vector.extract %110[0] : vector<1xf32>
%112 = splat %111 : vector<4xf32>
%113 = vector.fma %112, %29, %109 : vector<4xf32>
scf.yield %50, %71, %92, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %21#0, %21#1, %21#2, %21#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%16 = divi_signed %15, %c4 : index
%17 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 24)>(%16)[%arg0, %8, %3, %9]
memref.store %14#3, %2[%17] : memref<?xvector<4xf32>>
%18 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 16)>(%16)[%arg0, %8, %3, %9]
memref.store %14#2, %2[%18] : memref<?xvector<4xf32>>
%19 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 8)>(%16)[%arg0, %8, %3, %9]
memref.store %14#1, %2[%19] : memref<?xvector<4xf32>>
%20 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8)>(%16)[%arg0, %8, %3, %9]
memref.store %14#0, %2[%20] : memref<?xvector<4xf32>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After Canonicalizer //----- //
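// Only constant reordering this round; the flattened index maps produced by
// FlattenMemRefSubspan are already in canonical form.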
module {
func @conv_dispatch_0() {
%c100352 = constant 100352 : index
%c216 = constant 216 : index
%c151875 = constant 151875 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%cst = constant dense<0.000000e+00> : vector<3xf32>
%cst_0 = constant dense<0.000000e+00> : vector<4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst_0, %arg3 = %cst_0, %arg4 = %cst_0, %arg5 = %cst_0) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%21:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%23 = divi_signed %22, %c4 : index
%24 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2)>(%arg1, %arg6, %23)
%25 = memref.load %1[%24] : memref<?xvector<4xf32>>
%26 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 8)>(%arg1, %arg6, %23)
%27 = memref.load %1[%26] : memref<?xvector<4xf32>>
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 16)>(%arg1, %arg6, %23)
%29 = memref.load %1[%28] : memref<?xvector<4xf32>>
%30 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3)>(%arg1, %arg6)[%13, %11, %5, %12]
%31 = memref.load %0[%30] : memref<?xf32>
%32 = vector.insert %31, %cst [0] : f32 into vector<3xf32>
%33 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 1)>(%arg1, %arg6)[%13, %11, %5, %12]
%34 = memref.load %0[%33] : memref<?xf32>
%35 = vector.insert %34, %32 [1] : f32 into vector<3xf32>
%36 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 2)>(%arg1, %arg6)[%13, %11, %5, %12]
%37 = memref.load %0[%36] : memref<?xf32>
%38 = vector.insert %37, %35 [2] : f32 into vector<3xf32>
%39 = vector.extract_strided_slice %38 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%40 = vector.extract %39[0] : vector<1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.fma %41, %25, %arg7 : vector<4xf32>
%43 = vector.extract_strided_slice %38 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%44 = vector.extract %43[0] : vector<1xf32>
%45 = splat %44 : vector<4xf32>
%46 = vector.fma %45, %27, %42 : vector<4xf32>
%47 = vector.extract_strided_slice %38 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%48 = vector.extract %47[0] : vector<1xf32>
%49 = splat %48 : vector<4xf32>
%50 = vector.fma %49, %29, %46 : vector<4xf32>
%51 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 6)>(%arg1)[%13, %11, %5, %12, %arg6]
%52 = memref.load %0[%51] : memref<?xf32>
%53 = vector.insert %52, %cst [0] : f32 into vector<3xf32>
%54 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 7)>(%arg1)[%13, %11, %5, %12, %arg6]
%55 = memref.load %0[%54] : memref<?xf32>
%56 = vector.insert %55, %53 [1] : f32 into vector<3xf32>
%57 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 8)>(%arg1)[%13, %11, %5, %12, %arg6]
%58 = memref.load %0[%57] : memref<?xf32>
%59 = vector.insert %58, %56 [2] : f32 into vector<3xf32>
%60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%61 = vector.extract %60[0] : vector<1xf32>
%62 = splat %61 : vector<4xf32>
%63 = vector.fma %62, %25, %arg8 : vector<4xf32>
%64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %27, %63 : vector<4xf32>
%68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract %68[0] : vector<1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %29, %67 : vector<4xf32>
%72 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 12)>(%arg1)[%13, %11, %5, %12, %arg6]
%73 = memref.load %0[%72] : memref<?xf32>
%74 = vector.insert %73, %cst [0] : f32 into vector<3xf32>
%75 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 13)>(%arg1)[%13, %11, %5, %12, %arg6]
%76 = memref.load %0[%75] : memref<?xf32>
%77 = vector.insert %76, %74 [1] : f32 into vector<3xf32>
%78 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 14)>(%arg1)[%13, %11, %5, %12, %arg6]
%79 = memref.load %0[%78] : memref<?xf32>
%80 = vector.insert %79, %77 [2] : f32 into vector<3xf32>
%81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%82 = vector.extract %81[0] : vector<1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %25, %arg9 : vector<4xf32>
%85 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%86 = vector.extract %85[0] : vector<1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.fma %87, %27, %84 : vector<4xf32>
%89 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%90 = vector.extract %89[0] : vector<1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %29, %88 : vector<4xf32>
%93 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 18)>(%arg1)[%13, %11, %5, %12, %arg6]
%94 = memref.load %0[%93] : memref<?xf32>
%95 = vector.insert %94, %cst [0] : f32 into vector<3xf32>
%96 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 19)>(%arg1)[%13, %11, %5, %12, %arg6]
%97 = memref.load %0[%96] : memref<?xf32>
%98 = vector.insert %97, %95 [1] : f32 into vector<3xf32>
%99 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 20)>(%arg1)[%13, %11, %5, %12, %arg6]
%100 = memref.load %0[%99] : memref<?xf32>
%101 = vector.insert %100, %98 [2] : f32 into vector<3xf32>
%102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%103 = vector.extract %102[0] : vector<1xf32>
%104 = splat %103 : vector<4xf32>
%105 = vector.fma %104, %25, %arg10 : vector<4xf32>
%106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%107 = vector.extract %106[0] : vector<1xf32>
%108 = splat %107 : vector<4xf32>
%109 = vector.fma %108, %27, %105 : vector<4xf32>
%110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%111 = vector.extract %110[0] : vector<1xf32>
%112 = splat %111 : vector<4xf32>
%113 = vector.fma %112, %29, %109 : vector<4xf32>
scf.yield %50, %71, %92, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %21#0, %21#1, %21#2, %21#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%16 = divi_signed %15, %c4 : index
%17 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 24)>(%16)[%arg0, %8, %3, %9]
memref.store %14#3, %2[%17] : memref<?xvector<4xf32>>
%18 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 16)>(%16)[%arg0, %8, %3, %9]
memref.store %14#2, %2[%18] : memref<?xvector<4xf32>>
%19 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 8)>(%16)[%arg0, %8, %3, %9]
memref.store %14#1, %2[%19] : memref<?xvector<4xf32>>
%20 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8)>(%16)[%arg0, %8, %3, %9]
memref.store %14#0, %2[%20] : memref<?xvector<4xf32>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
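// annotation (not compiler output): CSE is common subexpression elimination; it
// merges identical pure ops (constants, index arithmetic) so each value is
// defined once. In this excerpt the visible body of @conv_dispatch_0 is
// unchanged from the preceding dump (same SSA structure), so any folding
// happened outside the shown region.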
module {
func @conv_dispatch_0() {
%c100352 = constant 100352 : index
%c216 = constant 216 : index
%c151875 = constant 151875 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%cst = constant dense<0.000000e+00> : vector<3xf32>
%cst_0 = constant dense<0.000000e+00> : vector<4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
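    // annotation: the binding sizes decode directly from the conv shapes:
    // input 225x225x3 f32 = 151875 floats; filter 3x3x3x32 = 864 floats =
    // 216 vector<4xf32>; output 112x112x32 = 401408 floats = 100352
    // vector<4xf32> (the channel dimension is vectorized by 4).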
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
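    // annotation: decoded thread mapping, assuming the local size 8x2x1 seen in
    // the later spv.entry_point_abi attribute: each thread accumulates 4
    // adjacent output pixels x 4 channels, so a workgroup tiles 1 output row x
    // 8 pixels (thread y) x 32 channels (thread x), while workgroup z steps
    // over the 112 output rows in the loop below.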
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst_0, %arg3 = %cst_0, %arg4 = %cst_0, %arg5 = %cst_0) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%21:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
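          // annotation: the two scf.for loops bounded by %c3 walk the 3x3
          // filter window (%arg1 = filter row, %arg6 = filter column); the four
          // vector<4xf32> iter_args each accumulate a 4-channel output vector
          // for one of the four adjacent output pixels owned by this thread.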
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%23 = divi_signed %22, %c4 : index
%24 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2)>(%arg1, %arg6, %23)
%25 = memref.load %1[%24] : memref<?xvector<4xf32>>
%26 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 8)>(%arg1, %arg6, %23)
%27 = memref.load %1[%26] : memref<?xvector<4xf32>>
%28 = affine.apply affine_map<(d0, d1, d2) -> (d0 * 72 + d1 * 24 + d2 + 16)>(%arg1, %arg6, %23)
%29 = memref.load %1[%28] : memref<?xvector<4xf32>>
%30 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3)>(%arg1, %arg6)[%13, %11, %5, %12]
%31 = memref.load %0[%30] : memref<?xf32>
%32 = vector.insert %31, %cst [0] : f32 into vector<3xf32>
%33 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 1)>(%arg1, %arg6)[%13, %11, %5, %12]
%34 = memref.load %0[%33] : memref<?xf32>
%35 = vector.insert %34, %32 [1] : f32 into vector<3xf32>
%36 = affine.apply affine_map<(d0, d1)[s0, s1, s2, s3] -> (d0 * 675 + d1 * 3 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + 2)>(%arg1, %arg6)[%13, %11, %5, %12]
%37 = memref.load %0[%36] : memref<?xf32>
%38 = vector.insert %37, %35 [2] : f32 into vector<3xf32>
%39 = vector.extract_strided_slice %38 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%40 = vector.extract %39[0] : vector<1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.fma %41, %25, %arg7 : vector<4xf32>
%43 = vector.extract_strided_slice %38 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%44 = vector.extract %43[0] : vector<1xf32>
%45 = splat %44 : vector<4xf32>
%46 = vector.fma %45, %27, %42 : vector<4xf32>
%47 = vector.extract_strided_slice %38 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%48 = vector.extract %47[0] : vector<1xf32>
%49 = splat %48 : vector<4xf32>
%50 = vector.fma %49, %29, %46 : vector<4xf32>
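          // annotation: one 3-channel input sample is loaded scalar-by-scalar
          // into a vector<3xf32>, then each channel is splatted and FMA'd
          // against the matching filter vector (%25, %27, %29, one per input
          // channel) to accumulate 4 output channels. The same pattern repeats
          // below at input offsets +6, +12, +18: with 3 input channels and
          // convolution stride 2, adjacent output pixels sit 6 floats apart in
          // the input row.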
%51 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 6)>(%arg1)[%13, %11, %5, %12, %arg6]
%52 = memref.load %0[%51] : memref<?xf32>
%53 = vector.insert %52, %cst [0] : f32 into vector<3xf32>
%54 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 7)>(%arg1)[%13, %11, %5, %12, %arg6]
%55 = memref.load %0[%54] : memref<?xf32>
%56 = vector.insert %55, %53 [1] : f32 into vector<3xf32>
%57 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 8)>(%arg1)[%13, %11, %5, %12, %arg6]
%58 = memref.load %0[%57] : memref<?xf32>
%59 = vector.insert %58, %56 [2] : f32 into vector<3xf32>
%60 = vector.extract_strided_slice %59 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%61 = vector.extract %60[0] : vector<1xf32>
%62 = splat %61 : vector<4xf32>
%63 = vector.fma %62, %25, %arg8 : vector<4xf32>
%64 = vector.extract_strided_slice %59 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%65 = vector.extract %64[0] : vector<1xf32>
%66 = splat %65 : vector<4xf32>
%67 = vector.fma %66, %27, %63 : vector<4xf32>
%68 = vector.extract_strided_slice %59 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract %68[0] : vector<1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %29, %67 : vector<4xf32>
%72 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 12)>(%arg1)[%13, %11, %5, %12, %arg6]
%73 = memref.load %0[%72] : memref<?xf32>
%74 = vector.insert %73, %cst [0] : f32 into vector<3xf32>
%75 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 13)>(%arg1)[%13, %11, %5, %12, %arg6]
%76 = memref.load %0[%75] : memref<?xf32>
%77 = vector.insert %76, %74 [1] : f32 into vector<3xf32>
%78 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 14)>(%arg1)[%13, %11, %5, %12, %arg6]
%79 = memref.load %0[%78] : memref<?xf32>
%80 = vector.insert %79, %77 [2] : f32 into vector<3xf32>
%81 = vector.extract_strided_slice %80 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%82 = vector.extract %81[0] : vector<1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %25, %arg9 : vector<4xf32>
%85 = vector.extract_strided_slice %80 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%86 = vector.extract %85[0] : vector<1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.fma %87, %27, %84 : vector<4xf32>
%89 = vector.extract_strided_slice %80 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%90 = vector.extract %89[0] : vector<1xf32>
%91 = splat %90 : vector<4xf32>
%92 = vector.fma %91, %29, %88 : vector<4xf32>
%93 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 18)>(%arg1)[%13, %11, %5, %12, %arg6]
%94 = memref.load %0[%93] : memref<?xf32>
%95 = vector.insert %94, %cst [0] : f32 into vector<3xf32>
%96 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 19)>(%arg1)[%13, %11, %5, %12, %arg6]
%97 = memref.load %0[%96] : memref<?xf32>
%98 = vector.insert %97, %95 [1] : f32 into vector<3xf32>
%99 = affine.apply affine_map<(d0)[s0, s1, s2, s3, s4] -> (d0 * 675 + s0 * 675 + s1 * 675 + s2 * 3 + s3 * 3 + s4 * 3 + 20)>(%arg1)[%13, %11, %5, %12, %arg6]
%100 = memref.load %0[%99] : memref<?xf32>
%101 = vector.insert %100, %98 [2] : f32 into vector<3xf32>
%102 = vector.extract_strided_slice %101 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%103 = vector.extract %102[0] : vector<1xf32>
%104 = splat %103 : vector<4xf32>
%105 = vector.fma %104, %25, %arg10 : vector<4xf32>
%106 = vector.extract_strided_slice %101 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%107 = vector.extract %106[0] : vector<1xf32>
%108 = splat %107 : vector<4xf32>
%109 = vector.fma %108, %27, %105 : vector<4xf32>
%110 = vector.extract_strided_slice %101 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%111 = vector.extract %110[0] : vector<1xf32>
%112 = splat %111 : vector<4xf32>
%113 = vector.fma %112, %29, %109 : vector<4xf32>
scf.yield %50, %71, %92, %113 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %21#0, %21#1, %21#2, %21#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%4, %10]
%16 = divi_signed %15, %c4 : index
%17 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 24)>(%16)[%arg0, %8, %3, %9]
memref.store %14#3, %2[%17] : memref<?xvector<4xf32>>
%18 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 16)>(%16)[%arg0, %8, %3, %9]
memref.store %14#2, %2[%18] : memref<?xvector<4xf32>>
%19 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8 + 8)>(%16)[%arg0, %8, %3, %9]
memref.store %14#1, %2[%19] : memref<?xvector<4xf32>>
%20 = affine.apply affine_map<(d0)[s0, s1, s2, s3] -> (d0 + s0 * 896 + s1 * 896 + s2 * 8 + s3 * 8)>(%16)[%arg0, %8, %3, %9]
memref.store %14#0, %2[%20] : memref<?xvector<4xf32>>
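      // annotation: the four stores write the accumulated pixels back at vector
      // offsets +0/+8/+16/+24, i.e. one output pixel apart (8 vector<4xf32> per
      // pixel of 32 channels); 896 = 112 pixels * 8 vectors is the output row
      // stride appearing in the maps above.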
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After ConvertAffineToStandard //----- //
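// annotation (not compiler output): every affine.apply from the previous dump
// is now expanded into explicit muli/addi chains on index values. Note the
// constants are rematerialized per expansion (%c675, %c675_26, %c675_27, ...),
// leaving the redundancy to later folding and canonicalization.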
module {
func @conv_dispatch_0() {
%c100352 = constant 100352 : index
%c216 = constant 216 : index
%c151875 = constant 151875 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c32 = constant 32 : index
%c16 = constant 16 : index
%c4 = constant 4 : index
%c2 = constant 2 : index
%c8 = constant 8 : index
%cst = constant dense<0.000000e+00> : vector<3xf32>
%cst_0 = constant dense<0.000000e+00> : vector<4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<?xf32>{%c151875}
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<?xvector<4xf32>>{%c216}
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<?xvector<4xf32>>{%c100352}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_y, %c8 : index
%4 = muli %workgroup_id_x, %c32 : index
%5 = muli %workgroup_id_y, %c16 : index
%6 = "gpu.thread_id"() {dimension = "x"} : () -> index
%7 = "gpu.thread_id"() {dimension = "y"} : () -> index
%8 = "gpu.thread_id"() {dimension = "z"} : () -> index
%9 = muli %7, %c4 : index
%10 = muli %6, %c4 : index
%11 = muli %8, %c2 : index
%12 = muli %7, %c8 : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%13 = muli %arg0, %c2 : index
%14:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %cst_0, %arg3 = %cst_0, %arg4 = %cst_0, %arg5 = %cst_0) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%52:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%53 = addi %4, %10 : index
%54 = divi_signed %53, %c4 : index
%c72 = constant 72 : index
%55 = muli %arg1, %c72 : index
%c24_18 = constant 24 : index
%56 = muli %arg6, %c24_18 : index
%57 = addi %55, %56 : index
%58 = addi %57, %54 : index
%59 = memref.load %1[%58] : memref<?xvector<4xf32>>
%c72_19 = constant 72 : index
%60 = muli %arg1, %c72_19 : index
%c24_20 = constant 24 : index
%61 = muli %arg6, %c24_20 : index
%62 = addi %60, %61 : index
%63 = addi %62, %54 : index
%c8_21 = constant 8 : index
%64 = addi %63, %c8_21 : index
%65 = memref.load %1[%64] : memref<?xvector<4xf32>>
%c72_22 = constant 72 : index
%66 = muli %arg1, %c72_22 : index
%c24_23 = constant 24 : index
%67 = muli %arg6, %c24_23 : index
%68 = addi %66, %67 : index
%69 = addi %68, %54 : index
%c16_24 = constant 16 : index
%70 = addi %69, %c16_24 : index
%71 = memref.load %1[%70] : memref<?xvector<4xf32>>
%c675 = constant 675 : index
%72 = muli %arg1, %c675 : index
%c3_25 = constant 3 : index
%73 = muli %arg6, %c3_25 : index
%74 = addi %72, %73 : index
%c675_26 = constant 675 : index
%75 = muli %13, %c675_26 : index
%76 = addi %74, %75 : index
%c675_27 = constant 675 : index
%77 = muli %11, %c675_27 : index
%78 = addi %76, %77 : index
%c3_28 = constant 3 : index
%79 = muli %5, %c3_28 : index
%80 = addi %78, %79 : index
%c3_29 = constant 3 : index
%81 = muli %12, %c3_29 : index
%82 = addi %80, %81 : index
%83 = memref.load %0[%82] : memref<?xf32>
%84 = vector.insert %83, %cst [0] : f32 into vector<3xf32>
%c675_30 = constant 675 : index
%85 = muli %arg1, %c675_30 : index
%c3_31 = constant 3 : index
%86 = muli %arg6, %c3_31 : index
%87 = addi %85, %86 : index
%c675_32 = constant 675 : index
%88 = muli %13, %c675_32 : index
%89 = addi %87, %88 : index
%c675_33 = constant 675 : index
%90 = muli %11, %c675_33 : index
%91 = addi %89, %90 : index
%c3_34 = constant 3 : index
%92 = muli %5, %c3_34 : index
%93 = addi %91, %92 : index
%c3_35 = constant 3 : index
%94 = muli %12, %c3_35 : index
%95 = addi %93, %94 : index
%c1_36 = constant 1 : index
%96 = addi %95, %c1_36 : index
%97 = memref.load %0[%96] : memref<?xf32>
%98 = vector.insert %97, %84 [1] : f32 into vector<3xf32>
%c675_37 = constant 675 : index
%99 = muli %arg1, %c675_37 : index
%c3_38 = constant 3 : index
%100 = muli %arg6, %c3_38 : index
%101 = addi %99, %100 : index
%c675_39 = constant 675 : index
%102 = muli %13, %c675_39 : index
%103 = addi %101, %102 : index
%c675_40 = constant 675 : index
%104 = muli %11, %c675_40 : index
%105 = addi %103, %104 : index
%c3_41 = constant 3 : index
%106 = muli %5, %c3_41 : index
%107 = addi %105, %106 : index
%c3_42 = constant 3 : index
%108 = muli %12, %c3_42 : index
%109 = addi %107, %108 : index
%c2_43 = constant 2 : index
%110 = addi %109, %c2_43 : index
%111 = memref.load %0[%110] : memref<?xf32>
%112 = vector.insert %111, %98 [2] : f32 into vector<3xf32>
%113 = vector.extract_strided_slice %112 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%114 = vector.extract %113[0] : vector<1xf32>
%115 = splat %114 : vector<4xf32>
%116 = vector.fma %115, %59, %arg7 : vector<4xf32>
%117 = vector.extract_strided_slice %112 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%118 = vector.extract %117[0] : vector<1xf32>
%119 = splat %118 : vector<4xf32>
%120 = vector.fma %119, %65, %116 : vector<4xf32>
%121 = vector.extract_strided_slice %112 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%122 = vector.extract %121[0] : vector<1xf32>
%123 = splat %122 : vector<4xf32>
%124 = vector.fma %123, %71, %120 : vector<4xf32>
%c675_44 = constant 675 : index
%125 = muli %arg1, %c675_44 : index
%c675_45 = constant 675 : index
%126 = muli %13, %c675_45 : index
%127 = addi %125, %126 : index
%c675_46 = constant 675 : index
%128 = muli %11, %c675_46 : index
%129 = addi %127, %128 : index
%c3_47 = constant 3 : index
%130 = muli %5, %c3_47 : index
%131 = addi %129, %130 : index
%c3_48 = constant 3 : index
%132 = muli %12, %c3_48 : index
%133 = addi %131, %132 : index
%c3_49 = constant 3 : index
%134 = muli %arg6, %c3_49 : index
%135 = addi %133, %134 : index
%c6 = constant 6 : index
%136 = addi %135, %c6 : index
%137 = memref.load %0[%136] : memref<?xf32>
%138 = vector.insert %137, %cst [0] : f32 into vector<3xf32>
%c675_50 = constant 675 : index
%139 = muli %arg1, %c675_50 : index
%c675_51 = constant 675 : index
%140 = muli %13, %c675_51 : index
%141 = addi %139, %140 : index
%c675_52 = constant 675 : index
%142 = muli %11, %c675_52 : index
%143 = addi %141, %142 : index
%c3_53 = constant 3 : index
%144 = muli %5, %c3_53 : index
%145 = addi %143, %144 : index
%c3_54 = constant 3 : index
%146 = muli %12, %c3_54 : index
%147 = addi %145, %146 : index
%c3_55 = constant 3 : index
%148 = muli %arg6, %c3_55 : index
%149 = addi %147, %148 : index
%c7 = constant 7 : index
%150 = addi %149, %c7 : index
%151 = memref.load %0[%150] : memref<?xf32>
%152 = vector.insert %151, %138 [1] : f32 into vector<3xf32>
%c675_56 = constant 675 : index
%153 = muli %arg1, %c675_56 : index
%c675_57 = constant 675 : index
%154 = muli %13, %c675_57 : index
%155 = addi %153, %154 : index
%c675_58 = constant 675 : index
%156 = muli %11, %c675_58 : index
%157 = addi %155, %156 : index
%c3_59 = constant 3 : index
%158 = muli %5, %c3_59 : index
%159 = addi %157, %158 : index
%c3_60 = constant 3 : index
%160 = muli %12, %c3_60 : index
%161 = addi %159, %160 : index
%c3_61 = constant 3 : index
%162 = muli %arg6, %c3_61 : index
%163 = addi %161, %162 : index
%c8_62 = constant 8 : index
%164 = addi %163, %c8_62 : index
%165 = memref.load %0[%164] : memref<?xf32>
%166 = vector.insert %165, %152 [2] : f32 into vector<3xf32>
%167 = vector.extract_strided_slice %166 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%168 = vector.extract %167[0] : vector<1xf32>
%169 = splat %168 : vector<4xf32>
%170 = vector.fma %169, %59, %arg8 : vector<4xf32>
%171 = vector.extract_strided_slice %166 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%172 = vector.extract %171[0] : vector<1xf32>
%173 = splat %172 : vector<4xf32>
%174 = vector.fma %173, %65, %170 : vector<4xf32>
%175 = vector.extract_strided_slice %166 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%176 = vector.extract %175[0] : vector<1xf32>
%177 = splat %176 : vector<4xf32>
%178 = vector.fma %177, %71, %174 : vector<4xf32>
%c675_63 = constant 675 : index
%179 = muli %arg1, %c675_63 : index
%c675_64 = constant 675 : index
%180 = muli %13, %c675_64 : index
%181 = addi %179, %180 : index
%c675_65 = constant 675 : index
%182 = muli %11, %c675_65 : index
%183 = addi %181, %182 : index
%c3_66 = constant 3 : index
%184 = muli %5, %c3_66 : index
%185 = addi %183, %184 : index
%c3_67 = constant 3 : index
%186 = muli %12, %c3_67 : index
%187 = addi %185, %186 : index
%c3_68 = constant 3 : index
%188 = muli %arg6, %c3_68 : index
%189 = addi %187, %188 : index
%c12 = constant 12 : index
%190 = addi %189, %c12 : index
%191 = memref.load %0[%190] : memref<?xf32>
%192 = vector.insert %191, %cst [0] : f32 into vector<3xf32>
%c675_69 = constant 675 : index
%193 = muli %arg1, %c675_69 : index
%c675_70 = constant 675 : index
%194 = muli %13, %c675_70 : index
%195 = addi %193, %194 : index
%c675_71 = constant 675 : index
%196 = muli %11, %c675_71 : index
%197 = addi %195, %196 : index
%c3_72 = constant 3 : index
%198 = muli %5, %c3_72 : index
%199 = addi %197, %198 : index
%c3_73 = constant 3 : index
%200 = muli %12, %c3_73 : index
%201 = addi %199, %200 : index
%c3_74 = constant 3 : index
%202 = muli %arg6, %c3_74 : index
%203 = addi %201, %202 : index
%c13 = constant 13 : index
%204 = addi %203, %c13 : index
%205 = memref.load %0[%204] : memref<?xf32>
%206 = vector.insert %205, %192 [1] : f32 into vector<3xf32>
%c675_75 = constant 675 : index
%207 = muli %arg1, %c675_75 : index
%c675_76 = constant 675 : index
%208 = muli %13, %c675_76 : index
%209 = addi %207, %208 : index
%c675_77 = constant 675 : index
%210 = muli %11, %c675_77 : index
%211 = addi %209, %210 : index
%c3_78 = constant 3 : index
%212 = muli %5, %c3_78 : index
%213 = addi %211, %212 : index
%c3_79 = constant 3 : index
%214 = muli %12, %c3_79 : index
%215 = addi %213, %214 : index
%c3_80 = constant 3 : index
%216 = muli %arg6, %c3_80 : index
%217 = addi %215, %216 : index
%c14 = constant 14 : index
%218 = addi %217, %c14 : index
%219 = memref.load %0[%218] : memref<?xf32>
%220 = vector.insert %219, %206 [2] : f32 into vector<3xf32>
%221 = vector.extract_strided_slice %220 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%222 = vector.extract %221[0] : vector<1xf32>
%223 = splat %222 : vector<4xf32>
%224 = vector.fma %223, %59, %arg9 : vector<4xf32>
%225 = vector.extract_strided_slice %220 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%226 = vector.extract %225[0] : vector<1xf32>
%227 = splat %226 : vector<4xf32>
%228 = vector.fma %227, %65, %224 : vector<4xf32>
%229 = vector.extract_strided_slice %220 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%230 = vector.extract %229[0] : vector<1xf32>
%231 = splat %230 : vector<4xf32>
%232 = vector.fma %231, %71, %228 : vector<4xf32>
%c675_81 = constant 675 : index
%233 = muli %arg1, %c675_81 : index
%c675_82 = constant 675 : index
%234 = muli %13, %c675_82 : index
%235 = addi %233, %234 : index
%c675_83 = constant 675 : index
%236 = muli %11, %c675_83 : index
%237 = addi %235, %236 : index
%c3_84 = constant 3 : index
%238 = muli %5, %c3_84 : index
%239 = addi %237, %238 : index
%c3_85 = constant 3 : index
%240 = muli %12, %c3_85 : index
%241 = addi %239, %240 : index
%c3_86 = constant 3 : index
%242 = muli %arg6, %c3_86 : index
%243 = addi %241, %242 : index
%c18 = constant 18 : index
%244 = addi %243, %c18 : index
%245 = memref.load %0[%244] : memref<?xf32>
%246 = vector.insert %245, %cst [0] : f32 into vector<3xf32>
%c675_87 = constant 675 : index
%247 = muli %arg1, %c675_87 : index
%c675_88 = constant 675 : index
%248 = muli %13, %c675_88 : index
%249 = addi %247, %248 : index
%c675_89 = constant 675 : index
%250 = muli %11, %c675_89 : index
%251 = addi %249, %250 : index
%c3_90 = constant 3 : index
%252 = muli %5, %c3_90 : index
%253 = addi %251, %252 : index
%c3_91 = constant 3 : index
%254 = muli %12, %c3_91 : index
%255 = addi %253, %254 : index
%c3_92 = constant 3 : index
%256 = muli %arg6, %c3_92 : index
%257 = addi %255, %256 : index
%c19 = constant 19 : index
%258 = addi %257, %c19 : index
%259 = memref.load %0[%258] : memref<?xf32>
%260 = vector.insert %259, %246 [1] : f32 into vector<3xf32>
%c675_93 = constant 675 : index
%261 = muli %arg1, %c675_93 : index
%c675_94 = constant 675 : index
%262 = muli %13, %c675_94 : index
%263 = addi %261, %262 : index
%c675_95 = constant 675 : index
%264 = muli %11, %c675_95 : index
%265 = addi %263, %264 : index
%c3_96 = constant 3 : index
%266 = muli %5, %c3_96 : index
%267 = addi %265, %266 : index
%c3_97 = constant 3 : index
%268 = muli %12, %c3_97 : index
%269 = addi %267, %268 : index
%c3_98 = constant 3 : index
%270 = muli %arg6, %c3_98 : index
%271 = addi %269, %270 : index
%c20 = constant 20 : index
%272 = addi %271, %c20 : index
%273 = memref.load %0[%272] : memref<?xf32>
%274 = vector.insert %273, %260 [2] : f32 into vector<3xf32>
%275 = vector.extract_strided_slice %274 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%276 = vector.extract %275[0] : vector<1xf32>
%277 = splat %276 : vector<4xf32>
%278 = vector.fma %277, %59, %arg10 : vector<4xf32>
%279 = vector.extract_strided_slice %274 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%280 = vector.extract %279[0] : vector<1xf32>
%281 = splat %280 : vector<4xf32>
%282 = vector.fma %281, %65, %278 : vector<4xf32>
%283 = vector.extract_strided_slice %274 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%284 = vector.extract %283[0] : vector<1xf32>
%285 = splat %284 : vector<4xf32>
%286 = vector.fma %285, %71, %282 : vector<4xf32>
scf.yield %124, %178, %232, %286 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %52#0, %52#1, %52#2, %52#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
%15 = addi %4, %10 : index
%16 = divi_signed %15, %c4 : index
%c896 = constant 896 : index
%17 = muli %arg0, %c896 : index
%18 = addi %16, %17 : index
%c896_1 = constant 896 : index
%19 = muli %8, %c896_1 : index
%20 = addi %18, %19 : index
%c8_2 = constant 8 : index
%21 = muli %3, %c8_2 : index
%22 = addi %20, %21 : index
%c8_3 = constant 8 : index
%23 = muli %9, %c8_3 : index
%24 = addi %22, %23 : index
%c24 = constant 24 : index
%25 = addi %24, %c24 : index
memref.store %14#3, %2[%25] : memref<?xvector<4xf32>>
%c896_4 = constant 896 : index
%26 = muli %arg0, %c896_4 : index
%27 = addi %16, %26 : index
%c896_5 = constant 896 : index
%28 = muli %8, %c896_5 : index
%29 = addi %27, %28 : index
%c8_6 = constant 8 : index
%30 = muli %3, %c8_6 : index
%31 = addi %29, %30 : index
%c8_7 = constant 8 : index
%32 = muli %9, %c8_7 : index
%33 = addi %31, %32 : index
%c16_8 = constant 16 : index
%34 = addi %33, %c16_8 : index
memref.store %14#2, %2[%34] : memref<?xvector<4xf32>>
%c896_9 = constant 896 : index
%35 = muli %arg0, %c896_9 : index
%36 = addi %16, %35 : index
%c896_10 = constant 896 : index
%37 = muli %8, %c896_10 : index
%38 = addi %36, %37 : index
%c8_11 = constant 8 : index
%39 = muli %3, %c8_11 : index
%40 = addi %38, %39 : index
%c8_12 = constant 8 : index
%41 = muli %9, %c8_12 : index
%42 = addi %40, %41 : index
%c8_13 = constant 8 : index
%43 = addi %42, %c8_13 : index
memref.store %14#1, %2[%43] : memref<?xvector<4xf32>>
%c896_14 = constant 896 : index
%44 = muli %arg0, %c896_14 : index
%45 = addi %16, %44 : index
%c896_15 = constant 896 : index
%46 = muli %8, %c896_15 : index
%47 = addi %45, %46 : index
%c8_16 = constant 8 : index
%48 = muli %3, %c8_16 : index
%49 = addi %47, %48 : index
%c8_17 = constant 8 : index
%50 = muli %9, %c8_17 : index
%51 = addi %49, %50 : index
memref.store %14#0, %2[%51] : memref<?xvector<4xf32>>
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After ConvertToSPIRV //----- //
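// annotation (not compiler output): conversion into the spv dialect. The hal
// bindings become spv.GlobalVariable structs wrapping runtime arrays,
// memref.load/store become spv.AccessChain + spv.Load/Store, each scf.for
// becomes a spv.mlir.loop (header ^bb1, body ^bb2, merge ^bb3), index math
// moves to i32 IMul/IAdd/SDiv, gpu.thread_id / workgroup IDs become loads of
// the LocalInvocationId / WorkgroupId builtins, and splat / vector.fma lower
// to spv.CompositeConstruct / spv.GLSL.Fma.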
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" attributes {spv.entry_point_abi = {local_size = dense<[8, 2, 1]> : vector<3xi32>}} {
%cst100352_i32 = spv.Constant 100352 : i32
%cst216_i32 = spv.Constant 216 : i32
%cst151875_i32 = spv.Constant 151875 : i32
%cst1_i32 = spv.Constant 1 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst8_i32 = spv.Constant 8 : i32
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_1 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_1 : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%__builtin_var_LocalInvocationId___addr_2 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_2 : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%__builtin_var_LocalInvocationId___addr_3 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_3 : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%80: i32, %81: vector<4xf32>, %82: vector<4xf32>, %83: vector<4xf32>, %84: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%85 = spv.SLessThan %80, %cst3_i32 : i32
spv.BranchConditional %85, ^bb2, ^bb3
^bb2: // pred: ^bb1
%86 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%87 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%88 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%89 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
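        // annotation: spv.mlir.loop yields no results, so the loop-carried
        // accumulators are spilled to these Function-storage variables inside
        // the body and re-loaded after the merge block.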
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %81, %82, %83, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%95: i32, %96: vector<4xf32>, %97: vector<4xf32>, %98: vector<4xf32>, %99: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%100 = spv.SLessThan %95, %cst3_i32 : i32
spv.BranchConditional %100, ^bb2, ^bb3
^bb2: // pred: ^bb1
%101 = spv.IAdd %7, %16 : i32
%102 = spv.SDiv %101, %cst4_i32 : i32
%cst72_i32 = spv.Constant 72 : i32
%103 = spv.IMul %80, %cst72_i32 : i32
%cst24_i32_33 = spv.Constant 24 : i32
%104 = spv.IMul %95, %cst24_i32_33 : i32
%105 = spv.IAdd %103, %104 : i32
%106 = spv.IAdd %105, %102 : i32
%cst0_i32_34 = spv.Constant 0 : i32
%cst0_i32_35 = spv.Constant 0 : i32
%cst1_i32_36 = spv.Constant 1 : i32
%107 = spv.IMul %cst1_i32_36, %106 : i32
%108 = spv.IAdd %cst0_i32_35, %107 : i32
%109 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_34, %108] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
%cst72_i32_37 = spv.Constant 72 : i32
%111 = spv.IMul %80, %cst72_i32_37 : i32
%cst24_i32_38 = spv.Constant 24 : i32
%112 = spv.IMul %95, %cst24_i32_38 : i32
%113 = spv.IAdd %111, %112 : i32
%114 = spv.IAdd %113, %102 : i32
%cst8_i32_39 = spv.Constant 8 : i32
%115 = spv.IAdd %114, %cst8_i32_39 : i32
%cst0_i32_40 = spv.Constant 0 : i32
%cst0_i32_41 = spv.Constant 0 : i32
%cst1_i32_42 = spv.Constant 1 : i32
%116 = spv.IMul %cst1_i32_42, %115 : i32
%117 = spv.IAdd %cst0_i32_41, %116 : i32
%118 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_40, %117] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : vector<4xf32>
%cst72_i32_43 = spv.Constant 72 : i32
%120 = spv.IMul %80, %cst72_i32_43 : i32
%cst24_i32_44 = spv.Constant 24 : i32
%121 = spv.IMul %95, %cst24_i32_44 : i32
%122 = spv.IAdd %120, %121 : i32
%123 = spv.IAdd %122, %102 : i32
%cst16_i32_45 = spv.Constant 16 : i32
%124 = spv.IAdd %123, %cst16_i32_45 : i32
%cst0_i32_46 = spv.Constant 0 : i32
%cst0_i32_47 = spv.Constant 0 : i32
%cst1_i32_48 = spv.Constant 1 : i32
%125 = spv.IMul %cst1_i32_48, %124 : i32
%126 = spv.IAdd %cst0_i32_47, %125 : i32
%127 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_46, %126] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%128 = spv.Load "StorageBuffer" %127 : vector<4xf32>
%cst675_i32 = spv.Constant 675 : i32
%129 = spv.IMul %80, %cst675_i32 : i32
%cst3_i32_49 = spv.Constant 3 : i32
%130 = spv.IMul %95, %cst3_i32_49 : i32
%131 = spv.IAdd %129, %130 : i32
%cst675_i32_50 = spv.Constant 675 : i32
%132 = spv.IMul %21, %cst675_i32_50 : i32
%133 = spv.IAdd %131, %132 : i32
%cst675_i32_51 = spv.Constant 675 : i32
%134 = spv.IMul %17, %cst675_i32_51 : i32
%135 = spv.IAdd %133, %134 : i32
%cst3_i32_52 = spv.Constant 3 : i32
%136 = spv.IMul %8, %cst3_i32_52 : i32
%137 = spv.IAdd %135, %136 : i32
%cst3_i32_53 = spv.Constant 3 : i32
%138 = spv.IMul %18, %cst3_i32_53 : i32
%139 = spv.IAdd %137, %138 : i32
%cst0_i32_54 = spv.Constant 0 : i32
%cst0_i32_55 = spv.Constant 0 : i32
%cst1_i32_56 = spv.Constant 1 : i32
%140 = spv.IMul %cst1_i32_56, %139 : i32
%141 = spv.IAdd %cst0_i32_55, %140 : i32
%142 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_54, %141] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%143 = spv.Load "StorageBuffer" %142 : f32
%144 = spv.CompositeInsert %143, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_57 = spv.Constant 675 : i32
%145 = spv.IMul %80, %cst675_i32_57 : i32
%cst3_i32_58 = spv.Constant 3 : i32
%146 = spv.IMul %95, %cst3_i32_58 : i32
%147 = spv.IAdd %145, %146 : i32
%cst675_i32_59 = spv.Constant 675 : i32
%148 = spv.IMul %21, %cst675_i32_59 : i32
%149 = spv.IAdd %147, %148 : i32
%cst675_i32_60 = spv.Constant 675 : i32
%150 = spv.IMul %17, %cst675_i32_60 : i32
%151 = spv.IAdd %149, %150 : i32
%cst3_i32_61 = spv.Constant 3 : i32
%152 = spv.IMul %8, %cst3_i32_61 : i32
%153 = spv.IAdd %151, %152 : i32
%cst3_i32_62 = spv.Constant 3 : i32
%154 = spv.IMul %18, %cst3_i32_62 : i32
%155 = spv.IAdd %153, %154 : i32
%cst1_i32_63 = spv.Constant 1 : i32
%156 = spv.IAdd %155, %cst1_i32_63 : i32
%cst0_i32_64 = spv.Constant 0 : i32
%cst0_i32_65 = spv.Constant 0 : i32
%cst1_i32_66 = spv.Constant 1 : i32
%157 = spv.IMul %cst1_i32_66, %156 : i32
%158 = spv.IAdd %cst0_i32_65, %157 : i32
%159 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_64, %158] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%160 = spv.Load "StorageBuffer" %159 : f32
%161 = spv.CompositeInsert %160, %144[1 : i32] : f32 into vector<3xf32>
%cst675_i32_67 = spv.Constant 675 : i32
%162 = spv.IMul %80, %cst675_i32_67 : i32
%cst3_i32_68 = spv.Constant 3 : i32
%163 = spv.IMul %95, %cst3_i32_68 : i32
%164 = spv.IAdd %162, %163 : i32
%cst675_i32_69 = spv.Constant 675 : i32
%165 = spv.IMul %21, %cst675_i32_69 : i32
%166 = spv.IAdd %164, %165 : i32
%cst675_i32_70 = spv.Constant 675 : i32
%167 = spv.IMul %17, %cst675_i32_70 : i32
%168 = spv.IAdd %166, %167 : i32
%cst3_i32_71 = spv.Constant 3 : i32
%169 = spv.IMul %8, %cst3_i32_71 : i32
%170 = spv.IAdd %168, %169 : i32
%cst3_i32_72 = spv.Constant 3 : i32
%171 = spv.IMul %18, %cst3_i32_72 : i32
%172 = spv.IAdd %170, %171 : i32
%cst2_i32_73 = spv.Constant 2 : i32
%173 = spv.IAdd %172, %cst2_i32_73 : i32
%cst0_i32_74 = spv.Constant 0 : i32
%cst0_i32_75 = spv.Constant 0 : i32
%cst1_i32_76 = spv.Constant 1 : i32
%174 = spv.IMul %cst1_i32_76, %173 : i32
%175 = spv.IAdd %cst0_i32_75, %174 : i32
%176 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_74, %175] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeInsert %177, %161[2 : i32] : f32 into vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %110, %96 : vector<4xf32>
%182 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%183 = spv.CompositeConstruct %182, %182, %182, %182 : vector<4xf32>
%184 = spv.GLSL.Fma %183, %119, %181 : vector<4xf32>
%185 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%186 = spv.CompositeConstruct %185, %185, %185, %185 : vector<4xf32>
%187 = spv.GLSL.Fma %186, %128, %184 : vector<4xf32>
%cst675_i32_77 = spv.Constant 675 : i32
%188 = spv.IMul %80, %cst675_i32_77 : i32
%cst675_i32_78 = spv.Constant 675 : i32
%189 = spv.IMul %21, %cst675_i32_78 : i32
%190 = spv.IAdd %188, %189 : i32
%cst675_i32_79 = spv.Constant 675 : i32
%191 = spv.IMul %17, %cst675_i32_79 : i32
%192 = spv.IAdd %190, %191 : i32
%cst3_i32_80 = spv.Constant 3 : i32
%193 = spv.IMul %8, %cst3_i32_80 : i32
%194 = spv.IAdd %192, %193 : i32
%cst3_i32_81 = spv.Constant 3 : i32
%195 = spv.IMul %18, %cst3_i32_81 : i32
%196 = spv.IAdd %194, %195 : i32
%cst3_i32_82 = spv.Constant 3 : i32
%197 = spv.IMul %95, %cst3_i32_82 : i32
%198 = spv.IAdd %196, %197 : i32
%cst6_i32 = spv.Constant 6 : i32
%199 = spv.IAdd %198, %cst6_i32 : i32
%cst0_i32_83 = spv.Constant 0 : i32
%cst0_i32_84 = spv.Constant 0 : i32
%cst1_i32_85 = spv.Constant 1 : i32
%200 = spv.IMul %cst1_i32_85, %199 : i32
%201 = spv.IAdd %cst0_i32_84, %200 : i32
%202 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_83, %201] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeInsert %203, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_86 = spv.Constant 675 : i32
%205 = spv.IMul %80, %cst675_i32_86 : i32
%cst675_i32_87 = spv.Constant 675 : i32
%206 = spv.IMul %21, %cst675_i32_87 : i32
%207 = spv.IAdd %205, %206 : i32
%cst675_i32_88 = spv.Constant 675 : i32
%208 = spv.IMul %17, %cst675_i32_88 : i32
%209 = spv.IAdd %207, %208 : i32
%cst3_i32_89 = spv.Constant 3 : i32
%210 = spv.IMul %8, %cst3_i32_89 : i32
%211 = spv.IAdd %209, %210 : i32
%cst3_i32_90 = spv.Constant 3 : i32
%212 = spv.IMul %18, %cst3_i32_90 : i32
%213 = spv.IAdd %211, %212 : i32
%cst3_i32_91 = spv.Constant 3 : i32
%214 = spv.IMul %95, %cst3_i32_91 : i32
%215 = spv.IAdd %213, %214 : i32
%cst7_i32 = spv.Constant 7 : i32
%216 = spv.IAdd %215, %cst7_i32 : i32
%cst0_i32_92 = spv.Constant 0 : i32
%cst0_i32_93 = spv.Constant 0 : i32
%cst1_i32_94 = spv.Constant 1 : i32
%217 = spv.IMul %cst1_i32_94, %216 : i32
%218 = spv.IAdd %cst0_i32_93, %217 : i32
%219 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_92, %218] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%220 = spv.Load "StorageBuffer" %219 : f32
%221 = spv.CompositeInsert %220, %204[1 : i32] : f32 into vector<3xf32>
%cst675_i32_95 = spv.Constant 675 : i32
%222 = spv.IMul %80, %cst675_i32_95 : i32
%cst675_i32_96 = spv.Constant 675 : i32
%223 = spv.IMul %21, %cst675_i32_96 : i32
%224 = spv.IAdd %222, %223 : i32
%cst675_i32_97 = spv.Constant 675 : i32
%225 = spv.IMul %17, %cst675_i32_97 : i32
%226 = spv.IAdd %224, %225 : i32
%cst3_i32_98 = spv.Constant 3 : i32
%227 = spv.IMul %8, %cst3_i32_98 : i32
%228 = spv.IAdd %226, %227 : i32
%cst3_i32_99 = spv.Constant 3 : i32
%229 = spv.IMul %18, %cst3_i32_99 : i32
%230 = spv.IAdd %228, %229 : i32
%cst3_i32_100 = spv.Constant 3 : i32
%231 = spv.IMul %95, %cst3_i32_100 : i32
%232 = spv.IAdd %230, %231 : i32
%cst8_i32_101 = spv.Constant 8 : i32
%233 = spv.IAdd %232, %cst8_i32_101 : i32
%cst0_i32_102 = spv.Constant 0 : i32
%cst0_i32_103 = spv.Constant 0 : i32
%cst1_i32_104 = spv.Constant 1 : i32
%234 = spv.IMul %cst1_i32_104, %233 : i32
%235 = spv.IAdd %cst0_i32_103, %234 : i32
%236 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_102, %235] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%237 = spv.Load "StorageBuffer" %236 : f32
%238 = spv.CompositeInsert %237, %221[2 : i32] : f32 into vector<3xf32>
%239 = spv.CompositeExtract %238[0 : i32] : vector<3xf32>
%240 = spv.CompositeConstruct %239, %239, %239, %239 : vector<4xf32>
%241 = spv.GLSL.Fma %240, %110, %97 : vector<4xf32>
%242 = spv.CompositeExtract %238[1 : i32] : vector<3xf32>
%243 = spv.CompositeConstruct %242, %242, %242, %242 : vector<4xf32>
%244 = spv.GLSL.Fma %243, %119, %241 : vector<4xf32>
%245 = spv.CompositeExtract %238[2 : i32] : vector<3xf32>
%246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
%247 = spv.GLSL.Fma %246, %128, %244 : vector<4xf32>
%cst675_i32_105 = spv.Constant 675 : i32
%248 = spv.IMul %80, %cst675_i32_105 : i32
%cst675_i32_106 = spv.Constant 675 : i32
%249 = spv.IMul %21, %cst675_i32_106 : i32
%250 = spv.IAdd %248, %249 : i32
%cst675_i32_107 = spv.Constant 675 : i32
%251 = spv.IMul %17, %cst675_i32_107 : i32
%252 = spv.IAdd %250, %251 : i32
%cst3_i32_108 = spv.Constant 3 : i32
%253 = spv.IMul %8, %cst3_i32_108 : i32
%254 = spv.IAdd %252, %253 : i32
%cst3_i32_109 = spv.Constant 3 : i32
%255 = spv.IMul %18, %cst3_i32_109 : i32
%256 = spv.IAdd %254, %255 : i32
%cst3_i32_110 = spv.Constant 3 : i32
%257 = spv.IMul %95, %cst3_i32_110 : i32
%258 = spv.IAdd %256, %257 : i32
%cst12_i32 = spv.Constant 12 : i32
%259 = spv.IAdd %258, %cst12_i32 : i32
%cst0_i32_111 = spv.Constant 0 : i32
%cst0_i32_112 = spv.Constant 0 : i32
%cst1_i32_113 = spv.Constant 1 : i32
%260 = spv.IMul %cst1_i32_113, %259 : i32
%261 = spv.IAdd %cst0_i32_112, %260 : i32
%262 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_111, %261] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%263 = spv.Load "StorageBuffer" %262 : f32
%264 = spv.CompositeInsert %263, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_114 = spv.Constant 675 : i32
%265 = spv.IMul %80, %cst675_i32_114 : i32
%cst675_i32_115 = spv.Constant 675 : i32
%266 = spv.IMul %21, %cst675_i32_115 : i32
%267 = spv.IAdd %265, %266 : i32
%cst675_i32_116 = spv.Constant 675 : i32
%268 = spv.IMul %17, %cst675_i32_116 : i32
%269 = spv.IAdd %267, %268 : i32
%cst3_i32_117 = spv.Constant 3 : i32
%270 = spv.IMul %8, %cst3_i32_117 : i32
%271 = spv.IAdd %269, %270 : i32
%cst3_i32_118 = spv.Constant 3 : i32
%272 = spv.IMul %18, %cst3_i32_118 : i32
%273 = spv.IAdd %271, %272 : i32
%cst3_i32_119 = spv.Constant 3 : i32
%274 = spv.IMul %95, %cst3_i32_119 : i32
%275 = spv.IAdd %273, %274 : i32
%cst13_i32 = spv.Constant 13 : i32
%276 = spv.IAdd %275, %cst13_i32 : i32
%cst0_i32_120 = spv.Constant 0 : i32
%cst0_i32_121 = spv.Constant 0 : i32
%cst1_i32_122 = spv.Constant 1 : i32
%277 = spv.IMul %cst1_i32_122, %276 : i32
%278 = spv.IAdd %cst0_i32_121, %277 : i32
%279 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_120, %278] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%280 = spv.Load "StorageBuffer" %279 : f32
%281 = spv.CompositeInsert %280, %264[1 : i32] : f32 into vector<3xf32>
%cst675_i32_123 = spv.Constant 675 : i32
%282 = spv.IMul %80, %cst675_i32_123 : i32
%cst675_i32_124 = spv.Constant 675 : i32
%283 = spv.IMul %21, %cst675_i32_124 : i32
%284 = spv.IAdd %282, %283 : i32
%cst675_i32_125 = spv.Constant 675 : i32
%285 = spv.IMul %17, %cst675_i32_125 : i32
%286 = spv.IAdd %284, %285 : i32
%cst3_i32_126 = spv.Constant 3 : i32
%287 = spv.IMul %8, %cst3_i32_126 : i32
%288 = spv.IAdd %286, %287 : i32
%cst3_i32_127 = spv.Constant 3 : i32
%289 = spv.IMul %18, %cst3_i32_127 : i32
%290 = spv.IAdd %288, %289 : i32
%cst3_i32_128 = spv.Constant 3 : i32
%291 = spv.IMul %95, %cst3_i32_128 : i32
%292 = spv.IAdd %290, %291 : i32
%cst14_i32 = spv.Constant 14 : i32
%293 = spv.IAdd %292, %cst14_i32 : i32
%cst0_i32_129 = spv.Constant 0 : i32
%cst0_i32_130 = spv.Constant 0 : i32
%cst1_i32_131 = spv.Constant 1 : i32
%294 = spv.IMul %cst1_i32_131, %293 : i32
%295 = spv.IAdd %cst0_i32_130, %294 : i32
%296 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_129, %295] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%297 = spv.Load "StorageBuffer" %296 : f32
%298 = spv.CompositeInsert %297, %281[2 : i32] : f32 into vector<3xf32>
%299 = spv.CompositeExtract %298[0 : i32] : vector<3xf32>
%300 = spv.CompositeConstruct %299, %299, %299, %299 : vector<4xf32>
%301 = spv.GLSL.Fma %300, %110, %98 : vector<4xf32>
%302 = spv.CompositeExtract %298[1 : i32] : vector<3xf32>
%303 = spv.CompositeConstruct %302, %302, %302, %302 : vector<4xf32>
%304 = spv.GLSL.Fma %303, %119, %301 : vector<4xf32>
%305 = spv.CompositeExtract %298[2 : i32] : vector<3xf32>
%306 = spv.CompositeConstruct %305, %305, %305, %305 : vector<4xf32>
%307 = spv.GLSL.Fma %306, %128, %304 : vector<4xf32>
%cst675_i32_132 = spv.Constant 675 : i32
%308 = spv.IMul %80, %cst675_i32_132 : i32
%cst675_i32_133 = spv.Constant 675 : i32
%309 = spv.IMul %21, %cst675_i32_133 : i32
%310 = spv.IAdd %308, %309 : i32
%cst675_i32_134 = spv.Constant 675 : i32
%311 = spv.IMul %17, %cst675_i32_134 : i32
%312 = spv.IAdd %310, %311 : i32
%cst3_i32_135 = spv.Constant 3 : i32
%313 = spv.IMul %8, %cst3_i32_135 : i32
%314 = spv.IAdd %312, %313 : i32
%cst3_i32_136 = spv.Constant 3 : i32
%315 = spv.IMul %18, %cst3_i32_136 : i32
%316 = spv.IAdd %314, %315 : i32
%cst3_i32_137 = spv.Constant 3 : i32
%317 = spv.IMul %95, %cst3_i32_137 : i32
%318 = spv.IAdd %316, %317 : i32
%cst18_i32 = spv.Constant 18 : i32
%319 = spv.IAdd %318, %cst18_i32 : i32
%cst0_i32_138 = spv.Constant 0 : i32
%cst0_i32_139 = spv.Constant 0 : i32
%cst1_i32_140 = spv.Constant 1 : i32
%320 = spv.IMul %cst1_i32_140, %319 : i32
%321 = spv.IAdd %cst0_i32_139, %320 : i32
%322 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_138, %321] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%323 = spv.Load "StorageBuffer" %322 : f32
%324 = spv.CompositeInsert %323, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_141 = spv.Constant 675 : i32
%325 = spv.IMul %80, %cst675_i32_141 : i32
%cst675_i32_142 = spv.Constant 675 : i32
%326 = spv.IMul %21, %cst675_i32_142 : i32
%327 = spv.IAdd %325, %326 : i32
%cst675_i32_143 = spv.Constant 675 : i32
%328 = spv.IMul %17, %cst675_i32_143 : i32
%329 = spv.IAdd %327, %328 : i32
%cst3_i32_144 = spv.Constant 3 : i32
%330 = spv.IMul %8, %cst3_i32_144 : i32
%331 = spv.IAdd %329, %330 : i32
%cst3_i32_145 = spv.Constant 3 : i32
%332 = spv.IMul %18, %cst3_i32_145 : i32
%333 = spv.IAdd %331, %332 : i32
%cst3_i32_146 = spv.Constant 3 : i32
%334 = spv.IMul %95, %cst3_i32_146 : i32
%335 = spv.IAdd %333, %334 : i32
%cst19_i32 = spv.Constant 19 : i32
%336 = spv.IAdd %335, %cst19_i32 : i32
%cst0_i32_147 = spv.Constant 0 : i32
%cst0_i32_148 = spv.Constant 0 : i32
%cst1_i32_149 = spv.Constant 1 : i32
%337 = spv.IMul %cst1_i32_149, %336 : i32
%338 = spv.IAdd %cst0_i32_148, %337 : i32
%339 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_147, %338] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%340 = spv.Load "StorageBuffer" %339 : f32
%341 = spv.CompositeInsert %340, %324[1 : i32] : f32 into vector<3xf32>
%cst675_i32_150 = spv.Constant 675 : i32
%342 = spv.IMul %80, %cst675_i32_150 : i32
%cst675_i32_151 = spv.Constant 675 : i32
%343 = spv.IMul %21, %cst675_i32_151 : i32
%344 = spv.IAdd %342, %343 : i32
%cst675_i32_152 = spv.Constant 675 : i32
%345 = spv.IMul %17, %cst675_i32_152 : i32
%346 = spv.IAdd %344, %345 : i32
%cst3_i32_153 = spv.Constant 3 : i32
%347 = spv.IMul %8, %cst3_i32_153 : i32
%348 = spv.IAdd %346, %347 : i32
%cst3_i32_154 = spv.Constant 3 : i32
%349 = spv.IMul %18, %cst3_i32_154 : i32
%350 = spv.IAdd %348, %349 : i32
%cst3_i32_155 = spv.Constant 3 : i32
%351 = spv.IMul %95, %cst3_i32_155 : i32
%352 = spv.IAdd %350, %351 : i32
%cst20_i32 = spv.Constant 20 : i32
%353 = spv.IAdd %352, %cst20_i32 : i32
%cst0_i32_156 = spv.Constant 0 : i32
%cst0_i32_157 = spv.Constant 0 : i32
%cst1_i32_158 = spv.Constant 1 : i32
%354 = spv.IMul %cst1_i32_158, %353 : i32
%355 = spv.IAdd %cst0_i32_157, %354 : i32
%356 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_156, %355] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%357 = spv.Load "StorageBuffer" %356 : f32
%358 = spv.CompositeInsert %357, %341[2 : i32] : f32 into vector<3xf32>
%359 = spv.CompositeExtract %358[0 : i32] : vector<3xf32>
%360 = spv.CompositeConstruct %359, %359, %359, %359 : vector<4xf32>
%361 = spv.GLSL.Fma %360, %110, %99 : vector<4xf32>
%362 = spv.CompositeExtract %358[1 : i32] : vector<3xf32>
%363 = spv.CompositeConstruct %362, %362, %362, %362 : vector<4xf32>
%364 = spv.GLSL.Fma %363, %119, %361 : vector<4xf32>
%365 = spv.CompositeExtract %358[2 : i32] : vector<3xf32>
%366 = spv.CompositeConstruct %365, %365, %365, %365 : vector<4xf32>
%367 = spv.GLSL.Fma %366, %128, %364 : vector<4xf32>
spv.Store "Function" %86, %187 : vector<4xf32>
spv.Store "Function" %87, %247 : vector<4xf32>
spv.Store "Function" %88, %307 : vector<4xf32>
spv.Store "Function" %89, %367 : vector<4xf32>
%368 = spv.IAdd %95, %cst1_i32 : i32
spv.Branch ^bb1(%368, %187, %247, %307, %367 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%90 = spv.Load "Function" %89 : vector<4xf32>
%91 = spv.Load "Function" %88 : vector<4xf32>
%92 = spv.Load "Function" %87 : vector<4xf32>
%93 = spv.Load "Function" %86 : vector<4xf32>
spv.Store "Function" %22, %93 : vector<4xf32>
spv.Store "Function" %23, %92 : vector<4xf32>
spv.Store "Function" %24, %91 : vector<4xf32>
spv.Store "Function" %25, %90 : vector<4xf32>
%94 = spv.IAdd %80, %cst1_i32 : i32
spv.Branch ^bb1(%94, %93, %92, %91, %90 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%cst896_i32 = spv.Constant 896 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%cst896_i32_4 = spv.Constant 896 : i32
%34 = spv.IMul %14, %cst896_i32_4 : i32
%35 = spv.IAdd %33, %34 : i32
%cst8_i32_5 = spv.Constant 8 : i32
%36 = spv.IMul %6, %cst8_i32_5 : i32
%37 = spv.IAdd %35, %36 : i32
%cst8_i32_6 = spv.Constant 8 : i32
%38 = spv.IMul %15, %cst8_i32_6 : i32
%39 = spv.IAdd %37, %38 : i32
%cst24_i32 = spv.Constant 24 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%cst0_i32_7 = spv.Constant 0 : i32
%cst0_i32_8 = spv.Constant 0 : i32
%cst1_i32_9 = spv.Constant 1 : i32
%41 = spv.IMul %cst1_i32_9, %40 : i32
%42 = spv.IAdd %cst0_i32_8, %41 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_7, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %26 : vector<4xf32>
%cst896_i32_10 = spv.Constant 896 : i32
%44 = spv.IMul %19, %cst896_i32_10 : i32
%45 = spv.IAdd %31, %44 : i32
%cst896_i32_11 = spv.Constant 896 : i32
%46 = spv.IMul %14, %cst896_i32_11 : i32
%47 = spv.IAdd %45, %46 : i32
%cst8_i32_12 = spv.Constant 8 : i32
%48 = spv.IMul %6, %cst8_i32_12 : i32
%49 = spv.IAdd %47, %48 : i32
%cst8_i32_13 = spv.Constant 8 : i32
%50 = spv.IMul %15, %cst8_i32_13 : i32
%51 = spv.IAdd %49, %50 : i32
%cst16_i32_14 = spv.Constant 16 : i32
%52 = spv.IAdd %51, %cst16_i32_14 : i32
%cst0_i32_15 = spv.Constant 0 : i32
%cst0_i32_16 = spv.Constant 0 : i32
%cst1_i32_17 = spv.Constant 1 : i32
%53 = spv.IMul %cst1_i32_17, %52 : i32
%54 = spv.IAdd %cst0_i32_16, %53 : i32
%55 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_15, %54] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %55, %27 : vector<4xf32>
%cst896_i32_18 = spv.Constant 896 : i32
%56 = spv.IMul %19, %cst896_i32_18 : i32
%57 = spv.IAdd %31, %56 : i32
%cst896_i32_19 = spv.Constant 896 : i32
%58 = spv.IMul %14, %cst896_i32_19 : i32
%59 = spv.IAdd %57, %58 : i32
%cst8_i32_20 = spv.Constant 8 : i32
%60 = spv.IMul %6, %cst8_i32_20 : i32
%61 = spv.IAdd %59, %60 : i32
%cst8_i32_21 = spv.Constant 8 : i32
%62 = spv.IMul %15, %cst8_i32_21 : i32
%63 = spv.IAdd %61, %62 : i32
%cst8_i32_22 = spv.Constant 8 : i32
%64 = spv.IAdd %63, %cst8_i32_22 : i32
%cst0_i32_23 = spv.Constant 0 : i32
%cst0_i32_24 = spv.Constant 0 : i32
%cst1_i32_25 = spv.Constant 1 : i32
%65 = spv.IMul %cst1_i32_25, %64 : i32
%66 = spv.IAdd %cst0_i32_24, %65 : i32
%67 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_23, %66] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %67, %28 : vector<4xf32>
%cst896_i32_26 = spv.Constant 896 : i32
%68 = spv.IMul %19, %cst896_i32_26 : i32
%69 = spv.IAdd %31, %68 : i32
%cst896_i32_27 = spv.Constant 896 : i32
%70 = spv.IMul %14, %cst896_i32_27 : i32
%71 = spv.IAdd %69, %70 : i32
%cst8_i32_28 = spv.Constant 8 : i32
%72 = spv.IMul %6, %cst8_i32_28 : i32
%73 = spv.IAdd %71, %72 : i32
%cst8_i32_29 = spv.Constant 8 : i32
%74 = spv.IMul %15, %cst8_i32_29 : i32
%75 = spv.IAdd %73, %74 : i32
%cst0_i32_30 = spv.Constant 0 : i32
%cst0_i32_31 = spv.Constant 0 : i32
%cst1_i32_32 = spv.Constant 1 : i32
%76 = spv.IMul %cst1_i32_32, %75 : i32
%77 = spv.IAdd %cst0_i32_31, %76 : i32
%78 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_30, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %78, %29 : vector<4xf32>
%79 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%79 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVLowerABIAttributes //----- //
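// Note: SPIRVLowerABIAttributes replaces the hal.interface bindings above with
// the spv.GlobalVariable resource variables bound at (set, binding) below, and
// declares @conv_dispatch_0 as a GLCompute entry point with its LocalSize
// execution mode (see the end of this dump).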
spv.module Logical GLSL450 {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst100352_i32 = spv.Constant 100352 : i32
%cst216_i32 = spv.Constant 216 : i32
%cst151875_i32 = spv.Constant 151875 : i32
%cst1_i32 = spv.Constant 1 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst8_i32 = spv.Constant 8 : i32
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_1 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_1 : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%__builtin_var_LocalInvocationId___addr_2 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_2 : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%__builtin_var_LocalInvocationId___addr_3 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_3 : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%80: i32, %81: vector<4xf32>, %82: vector<4xf32>, %83: vector<4xf32>, %84: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%85 = spv.SLessThan %80, %cst3_i32 : i32
spv.BranchConditional %85, ^bb2, ^bb3
^bb2: // pred: ^bb1
%86 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%87 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%88 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%89 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %81, %82, %83, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%95: i32, %96: vector<4xf32>, %97: vector<4xf32>, %98: vector<4xf32>, %99: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%100 = spv.SLessThan %95, %cst3_i32 : i32
spv.BranchConditional %100, ^bb2, ^bb3
^bb2: // pred: ^bb1
%101 = spv.IAdd %7, %16 : i32
%102 = spv.SDiv %101, %cst4_i32 : i32
%cst72_i32 = spv.Constant 72 : i32
%103 = spv.IMul %80, %cst72_i32 : i32
%cst24_i32_33 = spv.Constant 24 : i32
%104 = spv.IMul %95, %cst24_i32_33 : i32
%105 = spv.IAdd %103, %104 : i32
%106 = spv.IAdd %105, %102 : i32
%cst0_i32_34 = spv.Constant 0 : i32
%cst0_i32_35 = spv.Constant 0 : i32
%cst1_i32_36 = spv.Constant 1 : i32
%107 = spv.IMul %cst1_i32_36, %106 : i32
%108 = spv.IAdd %cst0_i32_35, %107 : i32
%109 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_34, %108] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
%cst72_i32_37 = spv.Constant 72 : i32
%111 = spv.IMul %80, %cst72_i32_37 : i32
%cst24_i32_38 = spv.Constant 24 : i32
%112 = spv.IMul %95, %cst24_i32_38 : i32
%113 = spv.IAdd %111, %112 : i32
%114 = spv.IAdd %113, %102 : i32
%cst8_i32_39 = spv.Constant 8 : i32
%115 = spv.IAdd %114, %cst8_i32_39 : i32
%cst0_i32_40 = spv.Constant 0 : i32
%cst0_i32_41 = spv.Constant 0 : i32
%cst1_i32_42 = spv.Constant 1 : i32
%116 = spv.IMul %cst1_i32_42, %115 : i32
%117 = spv.IAdd %cst0_i32_41, %116 : i32
%118 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_40, %117] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : vector<4xf32>
%cst72_i32_43 = spv.Constant 72 : i32
%120 = spv.IMul %80, %cst72_i32_43 : i32
%cst24_i32_44 = spv.Constant 24 : i32
%121 = spv.IMul %95, %cst24_i32_44 : i32
%122 = spv.IAdd %120, %121 : i32
%123 = spv.IAdd %122, %102 : i32
%cst16_i32_45 = spv.Constant 16 : i32
%124 = spv.IAdd %123, %cst16_i32_45 : i32
%cst0_i32_46 = spv.Constant 0 : i32
%cst0_i32_47 = spv.Constant 0 : i32
%cst1_i32_48 = spv.Constant 1 : i32
%125 = spv.IMul %cst1_i32_48, %124 : i32
%126 = spv.IAdd %cst0_i32_47, %125 : i32
%127 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32_46, %126] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%128 = spv.Load "StorageBuffer" %127 : vector<4xf32>
%cst675_i32 = spv.Constant 675 : i32
%129 = spv.IMul %80, %cst675_i32 : i32
%cst3_i32_49 = spv.Constant 3 : i32
%130 = spv.IMul %95, %cst3_i32_49 : i32
%131 = spv.IAdd %129, %130 : i32
%cst675_i32_50 = spv.Constant 675 : i32
%132 = spv.IMul %21, %cst675_i32_50 : i32
%133 = spv.IAdd %131, %132 : i32
%cst675_i32_51 = spv.Constant 675 : i32
%134 = spv.IMul %17, %cst675_i32_51 : i32
%135 = spv.IAdd %133, %134 : i32
%cst3_i32_52 = spv.Constant 3 : i32
%136 = spv.IMul %8, %cst3_i32_52 : i32
%137 = spv.IAdd %135, %136 : i32
%cst3_i32_53 = spv.Constant 3 : i32
%138 = spv.IMul %18, %cst3_i32_53 : i32
%139 = spv.IAdd %137, %138 : i32
%cst0_i32_54 = spv.Constant 0 : i32
%cst0_i32_55 = spv.Constant 0 : i32
%cst1_i32_56 = spv.Constant 1 : i32
%140 = spv.IMul %cst1_i32_56, %139 : i32
%141 = spv.IAdd %cst0_i32_55, %140 : i32
%142 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_54, %141] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%143 = spv.Load "StorageBuffer" %142 : f32
%144 = spv.CompositeInsert %143, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_57 = spv.Constant 675 : i32
%145 = spv.IMul %80, %cst675_i32_57 : i32
%cst3_i32_58 = spv.Constant 3 : i32
%146 = spv.IMul %95, %cst3_i32_58 : i32
%147 = spv.IAdd %145, %146 : i32
%cst675_i32_59 = spv.Constant 675 : i32
%148 = spv.IMul %21, %cst675_i32_59 : i32
%149 = spv.IAdd %147, %148 : i32
%cst675_i32_60 = spv.Constant 675 : i32
%150 = spv.IMul %17, %cst675_i32_60 : i32
%151 = spv.IAdd %149, %150 : i32
%cst3_i32_61 = spv.Constant 3 : i32
%152 = spv.IMul %8, %cst3_i32_61 : i32
%153 = spv.IAdd %151, %152 : i32
%cst3_i32_62 = spv.Constant 3 : i32
%154 = spv.IMul %18, %cst3_i32_62 : i32
%155 = spv.IAdd %153, %154 : i32
%cst1_i32_63 = spv.Constant 1 : i32
%156 = spv.IAdd %155, %cst1_i32_63 : i32
%cst0_i32_64 = spv.Constant 0 : i32
%cst0_i32_65 = spv.Constant 0 : i32
%cst1_i32_66 = spv.Constant 1 : i32
%157 = spv.IMul %cst1_i32_66, %156 : i32
%158 = spv.IAdd %cst0_i32_65, %157 : i32
%159 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_64, %158] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%160 = spv.Load "StorageBuffer" %159 : f32
%161 = spv.CompositeInsert %160, %144[1 : i32] : f32 into vector<3xf32>
%cst675_i32_67 = spv.Constant 675 : i32
%162 = spv.IMul %80, %cst675_i32_67 : i32
%cst3_i32_68 = spv.Constant 3 : i32
%163 = spv.IMul %95, %cst3_i32_68 : i32
%164 = spv.IAdd %162, %163 : i32
%cst675_i32_69 = spv.Constant 675 : i32
%165 = spv.IMul %21, %cst675_i32_69 : i32
%166 = spv.IAdd %164, %165 : i32
%cst675_i32_70 = spv.Constant 675 : i32
%167 = spv.IMul %17, %cst675_i32_70 : i32
%168 = spv.IAdd %166, %167 : i32
%cst3_i32_71 = spv.Constant 3 : i32
%169 = spv.IMul %8, %cst3_i32_71 : i32
%170 = spv.IAdd %168, %169 : i32
%cst3_i32_72 = spv.Constant 3 : i32
%171 = spv.IMul %18, %cst3_i32_72 : i32
%172 = spv.IAdd %170, %171 : i32
%cst2_i32_73 = spv.Constant 2 : i32
%173 = spv.IAdd %172, %cst2_i32_73 : i32
%cst0_i32_74 = spv.Constant 0 : i32
%cst0_i32_75 = spv.Constant 0 : i32
%cst1_i32_76 = spv.Constant 1 : i32
%174 = spv.IMul %cst1_i32_76, %173 : i32
%175 = spv.IAdd %cst0_i32_75, %174 : i32
%176 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_74, %175] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeInsert %177, %161[2 : i32] : f32 into vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %110, %96 : vector<4xf32>
%182 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%183 = spv.CompositeConstruct %182, %182, %182, %182 : vector<4xf32>
%184 = spv.GLSL.Fma %183, %119, %181 : vector<4xf32>
%185 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%186 = spv.CompositeConstruct %185, %185, %185, %185 : vector<4xf32>
%187 = spv.GLSL.Fma %186, %128, %184 : vector<4xf32>
%cst675_i32_77 = spv.Constant 675 : i32
%188 = spv.IMul %80, %cst675_i32_77 : i32
%cst675_i32_78 = spv.Constant 675 : i32
%189 = spv.IMul %21, %cst675_i32_78 : i32
%190 = spv.IAdd %188, %189 : i32
%cst675_i32_79 = spv.Constant 675 : i32
%191 = spv.IMul %17, %cst675_i32_79 : i32
%192 = spv.IAdd %190, %191 : i32
%cst3_i32_80 = spv.Constant 3 : i32
%193 = spv.IMul %8, %cst3_i32_80 : i32
%194 = spv.IAdd %192, %193 : i32
%cst3_i32_81 = spv.Constant 3 : i32
%195 = spv.IMul %18, %cst3_i32_81 : i32
%196 = spv.IAdd %194, %195 : i32
%cst3_i32_82 = spv.Constant 3 : i32
%197 = spv.IMul %95, %cst3_i32_82 : i32
%198 = spv.IAdd %196, %197 : i32
%cst6_i32 = spv.Constant 6 : i32
%199 = spv.IAdd %198, %cst6_i32 : i32
%cst0_i32_83 = spv.Constant 0 : i32
%cst0_i32_84 = spv.Constant 0 : i32
%cst1_i32_85 = spv.Constant 1 : i32
%200 = spv.IMul %cst1_i32_85, %199 : i32
%201 = spv.IAdd %cst0_i32_84, %200 : i32
%202 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_83, %201] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeInsert %203, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_86 = spv.Constant 675 : i32
%205 = spv.IMul %80, %cst675_i32_86 : i32
%cst675_i32_87 = spv.Constant 675 : i32
%206 = spv.IMul %21, %cst675_i32_87 : i32
%207 = spv.IAdd %205, %206 : i32
%cst675_i32_88 = spv.Constant 675 : i32
%208 = spv.IMul %17, %cst675_i32_88 : i32
%209 = spv.IAdd %207, %208 : i32
%cst3_i32_89 = spv.Constant 3 : i32
%210 = spv.IMul %8, %cst3_i32_89 : i32
%211 = spv.IAdd %209, %210 : i32
%cst3_i32_90 = spv.Constant 3 : i32
%212 = spv.IMul %18, %cst3_i32_90 : i32
%213 = spv.IAdd %211, %212 : i32
%cst3_i32_91 = spv.Constant 3 : i32
%214 = spv.IMul %95, %cst3_i32_91 : i32
%215 = spv.IAdd %213, %214 : i32
%cst7_i32 = spv.Constant 7 : i32
%216 = spv.IAdd %215, %cst7_i32 : i32
%cst0_i32_92 = spv.Constant 0 : i32
%cst0_i32_93 = spv.Constant 0 : i32
%cst1_i32_94 = spv.Constant 1 : i32
%217 = spv.IMul %cst1_i32_94, %216 : i32
%218 = spv.IAdd %cst0_i32_93, %217 : i32
%219 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_92, %218] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%220 = spv.Load "StorageBuffer" %219 : f32
%221 = spv.CompositeInsert %220, %204[1 : i32] : f32 into vector<3xf32>
%cst675_i32_95 = spv.Constant 675 : i32
%222 = spv.IMul %80, %cst675_i32_95 : i32
%cst675_i32_96 = spv.Constant 675 : i32
%223 = spv.IMul %21, %cst675_i32_96 : i32
%224 = spv.IAdd %222, %223 : i32
%cst675_i32_97 = spv.Constant 675 : i32
%225 = spv.IMul %17, %cst675_i32_97 : i32
%226 = spv.IAdd %224, %225 : i32
%cst3_i32_98 = spv.Constant 3 : i32
%227 = spv.IMul %8, %cst3_i32_98 : i32
%228 = spv.IAdd %226, %227 : i32
%cst3_i32_99 = spv.Constant 3 : i32
%229 = spv.IMul %18, %cst3_i32_99 : i32
%230 = spv.IAdd %228, %229 : i32
%cst3_i32_100 = spv.Constant 3 : i32
%231 = spv.IMul %95, %cst3_i32_100 : i32
%232 = spv.IAdd %230, %231 : i32
%cst8_i32_101 = spv.Constant 8 : i32
%233 = spv.IAdd %232, %cst8_i32_101 : i32
%cst0_i32_102 = spv.Constant 0 : i32
%cst0_i32_103 = spv.Constant 0 : i32
%cst1_i32_104 = spv.Constant 1 : i32
%234 = spv.IMul %cst1_i32_104, %233 : i32
%235 = spv.IAdd %cst0_i32_103, %234 : i32
%236 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_102, %235] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%237 = spv.Load "StorageBuffer" %236 : f32
%238 = spv.CompositeInsert %237, %221[2 : i32] : f32 into vector<3xf32>
%239 = spv.CompositeExtract %238[0 : i32] : vector<3xf32>
%240 = spv.CompositeConstruct %239, %239, %239, %239 : vector<4xf32>
%241 = spv.GLSL.Fma %240, %110, %97 : vector<4xf32>
%242 = spv.CompositeExtract %238[1 : i32] : vector<3xf32>
%243 = spv.CompositeConstruct %242, %242, %242, %242 : vector<4xf32>
%244 = spv.GLSL.Fma %243, %119, %241 : vector<4xf32>
%245 = spv.CompositeExtract %238[2 : i32] : vector<3xf32>
%246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
%247 = spv.GLSL.Fma %246, %128, %244 : vector<4xf32>
%cst675_i32_105 = spv.Constant 675 : i32
%248 = spv.IMul %80, %cst675_i32_105 : i32
%cst675_i32_106 = spv.Constant 675 : i32
%249 = spv.IMul %21, %cst675_i32_106 : i32
%250 = spv.IAdd %248, %249 : i32
%cst675_i32_107 = spv.Constant 675 : i32
%251 = spv.IMul %17, %cst675_i32_107 : i32
%252 = spv.IAdd %250, %251 : i32
%cst3_i32_108 = spv.Constant 3 : i32
%253 = spv.IMul %8, %cst3_i32_108 : i32
%254 = spv.IAdd %252, %253 : i32
%cst3_i32_109 = spv.Constant 3 : i32
%255 = spv.IMul %18, %cst3_i32_109 : i32
%256 = spv.IAdd %254, %255 : i32
%cst3_i32_110 = spv.Constant 3 : i32
%257 = spv.IMul %95, %cst3_i32_110 : i32
%258 = spv.IAdd %256, %257 : i32
%cst12_i32 = spv.Constant 12 : i32
%259 = spv.IAdd %258, %cst12_i32 : i32
%cst0_i32_111 = spv.Constant 0 : i32
%cst0_i32_112 = spv.Constant 0 : i32
%cst1_i32_113 = spv.Constant 1 : i32
%260 = spv.IMul %cst1_i32_113, %259 : i32
%261 = spv.IAdd %cst0_i32_112, %260 : i32
%262 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_111, %261] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%263 = spv.Load "StorageBuffer" %262 : f32
%264 = spv.CompositeInsert %263, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_114 = spv.Constant 675 : i32
%265 = spv.IMul %80, %cst675_i32_114 : i32
%cst675_i32_115 = spv.Constant 675 : i32
%266 = spv.IMul %21, %cst675_i32_115 : i32
%267 = spv.IAdd %265, %266 : i32
%cst675_i32_116 = spv.Constant 675 : i32
%268 = spv.IMul %17, %cst675_i32_116 : i32
%269 = spv.IAdd %267, %268 : i32
%cst3_i32_117 = spv.Constant 3 : i32
%270 = spv.IMul %8, %cst3_i32_117 : i32
%271 = spv.IAdd %269, %270 : i32
%cst3_i32_118 = spv.Constant 3 : i32
%272 = spv.IMul %18, %cst3_i32_118 : i32
%273 = spv.IAdd %271, %272 : i32
%cst3_i32_119 = spv.Constant 3 : i32
%274 = spv.IMul %95, %cst3_i32_119 : i32
%275 = spv.IAdd %273, %274 : i32
%cst13_i32 = spv.Constant 13 : i32
%276 = spv.IAdd %275, %cst13_i32 : i32
%cst0_i32_120 = spv.Constant 0 : i32
%cst0_i32_121 = spv.Constant 0 : i32
%cst1_i32_122 = spv.Constant 1 : i32
%277 = spv.IMul %cst1_i32_122, %276 : i32
%278 = spv.IAdd %cst0_i32_121, %277 : i32
%279 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_120, %278] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%280 = spv.Load "StorageBuffer" %279 : f32
%281 = spv.CompositeInsert %280, %264[1 : i32] : f32 into vector<3xf32>
%cst675_i32_123 = spv.Constant 675 : i32
%282 = spv.IMul %80, %cst675_i32_123 : i32
%cst675_i32_124 = spv.Constant 675 : i32
%283 = spv.IMul %21, %cst675_i32_124 : i32
%284 = spv.IAdd %282, %283 : i32
%cst675_i32_125 = spv.Constant 675 : i32
%285 = spv.IMul %17, %cst675_i32_125 : i32
%286 = spv.IAdd %284, %285 : i32
%cst3_i32_126 = spv.Constant 3 : i32
%287 = spv.IMul %8, %cst3_i32_126 : i32
%288 = spv.IAdd %286, %287 : i32
%cst3_i32_127 = spv.Constant 3 : i32
%289 = spv.IMul %18, %cst3_i32_127 : i32
%290 = spv.IAdd %288, %289 : i32
%cst3_i32_128 = spv.Constant 3 : i32
%291 = spv.IMul %95, %cst3_i32_128 : i32
%292 = spv.IAdd %290, %291 : i32
%cst14_i32 = spv.Constant 14 : i32
%293 = spv.IAdd %292, %cst14_i32 : i32
%cst0_i32_129 = spv.Constant 0 : i32
%cst0_i32_130 = spv.Constant 0 : i32
%cst1_i32_131 = spv.Constant 1 : i32
%294 = spv.IMul %cst1_i32_131, %293 : i32
%295 = spv.IAdd %cst0_i32_130, %294 : i32
%296 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_129, %295] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%297 = spv.Load "StorageBuffer" %296 : f32
%298 = spv.CompositeInsert %297, %281[2 : i32] : f32 into vector<3xf32>
%299 = spv.CompositeExtract %298[0 : i32] : vector<3xf32>
%300 = spv.CompositeConstruct %299, %299, %299, %299 : vector<4xf32>
%301 = spv.GLSL.Fma %300, %110, %98 : vector<4xf32>
%302 = spv.CompositeExtract %298[1 : i32] : vector<3xf32>
%303 = spv.CompositeConstruct %302, %302, %302, %302 : vector<4xf32>
%304 = spv.GLSL.Fma %303, %119, %301 : vector<4xf32>
%305 = spv.CompositeExtract %298[2 : i32] : vector<3xf32>
%306 = spv.CompositeConstruct %305, %305, %305, %305 : vector<4xf32>
%307 = spv.GLSL.Fma %306, %128, %304 : vector<4xf32>
%cst675_i32_132 = spv.Constant 675 : i32
%308 = spv.IMul %80, %cst675_i32_132 : i32
%cst675_i32_133 = spv.Constant 675 : i32
%309 = spv.IMul %21, %cst675_i32_133 : i32
%310 = spv.IAdd %308, %309 : i32
%cst675_i32_134 = spv.Constant 675 : i32
%311 = spv.IMul %17, %cst675_i32_134 : i32
%312 = spv.IAdd %310, %311 : i32
%cst3_i32_135 = spv.Constant 3 : i32
%313 = spv.IMul %8, %cst3_i32_135 : i32
%314 = spv.IAdd %312, %313 : i32
%cst3_i32_136 = spv.Constant 3 : i32
%315 = spv.IMul %18, %cst3_i32_136 : i32
%316 = spv.IAdd %314, %315 : i32
%cst3_i32_137 = spv.Constant 3 : i32
%317 = spv.IMul %95, %cst3_i32_137 : i32
%318 = spv.IAdd %316, %317 : i32
%cst18_i32 = spv.Constant 18 : i32
%319 = spv.IAdd %318, %cst18_i32 : i32
%cst0_i32_138 = spv.Constant 0 : i32
%cst0_i32_139 = spv.Constant 0 : i32
%cst1_i32_140 = spv.Constant 1 : i32
%320 = spv.IMul %cst1_i32_140, %319 : i32
%321 = spv.IAdd %cst0_i32_139, %320 : i32
%322 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_138, %321] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%323 = spv.Load "StorageBuffer" %322 : f32
%324 = spv.CompositeInsert %323, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%cst675_i32_141 = spv.Constant 675 : i32
%325 = spv.IMul %80, %cst675_i32_141 : i32
%cst675_i32_142 = spv.Constant 675 : i32
%326 = spv.IMul %21, %cst675_i32_142 : i32
%327 = spv.IAdd %325, %326 : i32
%cst675_i32_143 = spv.Constant 675 : i32
%328 = spv.IMul %17, %cst675_i32_143 : i32
%329 = spv.IAdd %327, %328 : i32
%cst3_i32_144 = spv.Constant 3 : i32
%330 = spv.IMul %8, %cst3_i32_144 : i32
%331 = spv.IAdd %329, %330 : i32
%cst3_i32_145 = spv.Constant 3 : i32
%332 = spv.IMul %18, %cst3_i32_145 : i32
%333 = spv.IAdd %331, %332 : i32
%cst3_i32_146 = spv.Constant 3 : i32
%334 = spv.IMul %95, %cst3_i32_146 : i32
%335 = spv.IAdd %333, %334 : i32
%cst19_i32 = spv.Constant 19 : i32
%336 = spv.IAdd %335, %cst19_i32 : i32
%cst0_i32_147 = spv.Constant 0 : i32
%cst0_i32_148 = spv.Constant 0 : i32
%cst1_i32_149 = spv.Constant 1 : i32
%337 = spv.IMul %cst1_i32_149, %336 : i32
%338 = spv.IAdd %cst0_i32_148, %337 : i32
%339 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_147, %338] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%340 = spv.Load "StorageBuffer" %339 : f32
%341 = spv.CompositeInsert %340, %324[1 : i32] : f32 into vector<3xf32>
%cst675_i32_150 = spv.Constant 675 : i32
%342 = spv.IMul %80, %cst675_i32_150 : i32
%cst675_i32_151 = spv.Constant 675 : i32
%343 = spv.IMul %21, %cst675_i32_151 : i32
%344 = spv.IAdd %342, %343 : i32
%cst675_i32_152 = spv.Constant 675 : i32
%345 = spv.IMul %17, %cst675_i32_152 : i32
%346 = spv.IAdd %344, %345 : i32
%cst3_i32_153 = spv.Constant 3 : i32
%347 = spv.IMul %8, %cst3_i32_153 : i32
%348 = spv.IAdd %346, %347 : i32
%cst3_i32_154 = spv.Constant 3 : i32
%349 = spv.IMul %18, %cst3_i32_154 : i32
%350 = spv.IAdd %348, %349 : i32
%cst3_i32_155 = spv.Constant 3 : i32
%351 = spv.IMul %95, %cst3_i32_155 : i32
%352 = spv.IAdd %350, %351 : i32
%cst20_i32 = spv.Constant 20 : i32
%353 = spv.IAdd %352, %cst20_i32 : i32
%cst0_i32_156 = spv.Constant 0 : i32
%cst0_i32_157 = spv.Constant 0 : i32
%cst1_i32_158 = spv.Constant 1 : i32
%354 = spv.IMul %cst1_i32_158, %353 : i32
%355 = spv.IAdd %cst0_i32_157, %354 : i32
%356 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32_156, %355] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%357 = spv.Load "StorageBuffer" %356 : f32
%358 = spv.CompositeInsert %357, %341[2 : i32] : f32 into vector<3xf32>
%359 = spv.CompositeExtract %358[0 : i32] : vector<3xf32>
%360 = spv.CompositeConstruct %359, %359, %359, %359 : vector<4xf32>
%361 = spv.GLSL.Fma %360, %110, %99 : vector<4xf32>
%362 = spv.CompositeExtract %358[1 : i32] : vector<3xf32>
%363 = spv.CompositeConstruct %362, %362, %362, %362 : vector<4xf32>
%364 = spv.GLSL.Fma %363, %119, %361 : vector<4xf32>
%365 = spv.CompositeExtract %358[2 : i32] : vector<3xf32>
%366 = spv.CompositeConstruct %365, %365, %365, %365 : vector<4xf32>
%367 = spv.GLSL.Fma %366, %128, %364 : vector<4xf32>
spv.Store "Function" %86, %187 : vector<4xf32>
spv.Store "Function" %87, %247 : vector<4xf32>
spv.Store "Function" %88, %307 : vector<4xf32>
spv.Store "Function" %89, %367 : vector<4xf32>
%368 = spv.IAdd %95, %cst1_i32 : i32
spv.Branch ^bb1(%368, %187, %247, %307, %367 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%90 = spv.Load "Function" %89 : vector<4xf32>
%91 = spv.Load "Function" %88 : vector<4xf32>
%92 = spv.Load "Function" %87 : vector<4xf32>
%93 = spv.Load "Function" %86 : vector<4xf32>
spv.Store "Function" %22, %93 : vector<4xf32>
spv.Store "Function" %23, %92 : vector<4xf32>
spv.Store "Function" %24, %91 : vector<4xf32>
spv.Store "Function" %25, %90 : vector<4xf32>
%94 = spv.IAdd %80, %cst1_i32 : i32
spv.Branch ^bb1(%94, %93, %92, %91, %90 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%cst896_i32 = spv.Constant 896 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%cst896_i32_4 = spv.Constant 896 : i32
%34 = spv.IMul %14, %cst896_i32_4 : i32
%35 = spv.IAdd %33, %34 : i32
%cst8_i32_5 = spv.Constant 8 : i32
%36 = spv.IMul %6, %cst8_i32_5 : i32
%37 = spv.IAdd %35, %36 : i32
%cst8_i32_6 = spv.Constant 8 : i32
%38 = spv.IMul %15, %cst8_i32_6 : i32
%39 = spv.IAdd %37, %38 : i32
%cst24_i32 = spv.Constant 24 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%cst0_i32_7 = spv.Constant 0 : i32
%cst0_i32_8 = spv.Constant 0 : i32
%cst1_i32_9 = spv.Constant 1 : i32
%41 = spv.IMul %cst1_i32_9, %40 : i32
%42 = spv.IAdd %cst0_i32_8, %41 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_7, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %26 : vector<4xf32>
%cst896_i32_10 = spv.Constant 896 : i32
%44 = spv.IMul %19, %cst896_i32_10 : i32
%45 = spv.IAdd %31, %44 : i32
%cst896_i32_11 = spv.Constant 896 : i32
%46 = spv.IMul %14, %cst896_i32_11 : i32
%47 = spv.IAdd %45, %46 : i32
%cst8_i32_12 = spv.Constant 8 : i32
%48 = spv.IMul %6, %cst8_i32_12 : i32
%49 = spv.IAdd %47, %48 : i32
%cst8_i32_13 = spv.Constant 8 : i32
%50 = spv.IMul %15, %cst8_i32_13 : i32
%51 = spv.IAdd %49, %50 : i32
%cst16_i32_14 = spv.Constant 16 : i32
%52 = spv.IAdd %51, %cst16_i32_14 : i32
%cst0_i32_15 = spv.Constant 0 : i32
%cst0_i32_16 = spv.Constant 0 : i32
%cst1_i32_17 = spv.Constant 1 : i32
%53 = spv.IMul %cst1_i32_17, %52 : i32
%54 = spv.IAdd %cst0_i32_16, %53 : i32
%55 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_15, %54] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %55, %27 : vector<4xf32>
%cst896_i32_18 = spv.Constant 896 : i32
%56 = spv.IMul %19, %cst896_i32_18 : i32
%57 = spv.IAdd %31, %56 : i32
%cst896_i32_19 = spv.Constant 896 : i32
%58 = spv.IMul %14, %cst896_i32_19 : i32
%59 = spv.IAdd %57, %58 : i32
%cst8_i32_20 = spv.Constant 8 : i32
%60 = spv.IMul %6, %cst8_i32_20 : i32
%61 = spv.IAdd %59, %60 : i32
%cst8_i32_21 = spv.Constant 8 : i32
%62 = spv.IMul %15, %cst8_i32_21 : i32
%63 = spv.IAdd %61, %62 : i32
%cst8_i32_22 = spv.Constant 8 : i32
%64 = spv.IAdd %63, %cst8_i32_22 : i32
%cst0_i32_23 = spv.Constant 0 : i32
%cst0_i32_24 = spv.Constant 0 : i32
%cst1_i32_25 = spv.Constant 1 : i32
%65 = spv.IMul %cst1_i32_25, %64 : i32
%66 = spv.IAdd %cst0_i32_24, %65 : i32
%67 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_23, %66] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %67, %28 : vector<4xf32>
%cst896_i32_26 = spv.Constant 896 : i32
%68 = spv.IMul %19, %cst896_i32_26 : i32
%69 = spv.IAdd %31, %68 : i32
%cst896_i32_27 = spv.Constant 896 : i32
%70 = spv.IMul %14, %cst896_i32_27 : i32
%71 = spv.IAdd %69, %70 : i32
%cst8_i32_28 = spv.Constant 8 : i32
%72 = spv.IMul %6, %cst8_i32_28 : i32
%73 = spv.IAdd %71, %72 : i32
%cst8_i32_29 = spv.Constant 8 : i32
%74 = spv.IMul %15, %cst8_i32_29 : i32
%75 = spv.IAdd %73, %74 : i32
%cst0_i32_30 = spv.Constant 0 : i32
%cst0_i32_31 = spv.Constant 0 : i32
%cst1_i32_32 = spv.Constant 1 : i32
%76 = spv.IMul %cst1_i32_32, %75 : i32
%77 = spv.IAdd %cst0_i32_31, %76 : i32
%78 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32_30, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %78, %29 : vector<4xf32>
%79 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%79 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
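// LocalSize 8x2x1 gives 16 invocations per workgroup.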
}
// -----// IR Dump After Canonicalizer //----- //
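// Note: relative to the previous dump, canonicalization keeps a single hoisted
// copy of each spv.Constant (the _N-suffixed duplicates are gone) and folds
// the no-op index arithmetic, namely the multiplies by %cst1_i32 and adds of
// %cst0_i32 that linearizing the access-chain indices had introduced.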
spv.module Logical GLSL450 {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
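// An inferred reading of the larger strides, from the constant values alone:
// 675 = 225 * 3 is one input row (225 pixels, 3 f32 channels); 896 = 112 * 8
// is one output row (112 pixels, 32 channels packed as 8 vec4s); 72 = 3 * 3 * 8
// and 24 = 3 * 8 are filter strides per filter row and column in vec4 units.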
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_0 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%__builtin_var_WorkgroupId___addr_1 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr_1 : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%__builtin_var_LocalInvocationId___addr_2 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_2 : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%__builtin_var_LocalInvocationId___addr_3 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr_3 : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%72: i32, %73: vector<4xf32>, %74: vector<4xf32>, %75: vector<4xf32>, %76: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%77 = spv.SLessThan %72, %cst3_i32 : i32
spv.BranchConditional %77, ^bb2, ^bb3
^bb2: // pred: ^bb1
%78 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%79 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %73, %74, %75, %76 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%87: i32, %88: vector<4xf32>, %89: vector<4xf32>, %90: vector<4xf32>, %91: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%92 = spv.SLessThan %87, %cst3_i32 : i32
spv.BranchConditional %92, ^bb2, ^bb3
^bb2: // pred: ^bb1
%93 = spv.IAdd %7, %16 : i32
%94 = spv.SDiv %93, %cst4_i32 : i32
%95 = spv.IMul %72, %cst72_i32 : i32
%96 = spv.IMul %87, %cst24_i32 : i32
%97 = spv.IAdd %95, %96 : i32
%98 = spv.IAdd %97, %94 : i32
%99 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %98] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%100 = spv.Load "StorageBuffer" %99 : vector<4xf32>
%101 = spv.IMul %72, %cst72_i32 : i32
%102 = spv.IMul %87, %cst24_i32 : i32
%103 = spv.IAdd %101, %102 : i32
%104 = spv.IAdd %103, %94 : i32
%105 = spv.IAdd %104, %cst8_i32 : i32
%106 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %105] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%107 = spv.Load "StorageBuffer" %106 : vector<4xf32>
%108 = spv.IMul %72, %cst72_i32 : i32
%109 = spv.IMul %87, %cst24_i32 : i32
%110 = spv.IAdd %108, %109 : i32
%111 = spv.IAdd %110, %94 : i32
%112 = spv.IAdd %111, %cst16_i32 : i32
%113 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %112] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%114 = spv.Load "StorageBuffer" %113 : vector<4xf32>
%115 = spv.IMul %72, %cst675_i32 : i32
%116 = spv.IMul %87, %cst3_i32 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.IMul %21, %cst675_i32 : i32
%119 = spv.IAdd %117, %118 : i32
%120 = spv.IMul %17, %cst675_i32 : i32
%121 = spv.IAdd %119, %120 : i32
%122 = spv.IMul %8, %cst3_i32 : i32
%123 = spv.IAdd %121, %122 : i32
%124 = spv.IMul %18, %cst3_i32 : i32
%125 = spv.IAdd %123, %124 : i32
%126 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %125] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%127 = spv.Load "StorageBuffer" %126 : f32
%128 = spv.CompositeInsert %127, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%129 = spv.IMul %72, %cst675_i32 : i32
%130 = spv.IMul %87, %cst3_i32 : i32
%131 = spv.IAdd %129, %130 : i32
%132 = spv.IMul %21, %cst675_i32 : i32
%133 = spv.IAdd %131, %132 : i32
%134 = spv.IMul %17, %cst675_i32 : i32
%135 = spv.IAdd %133, %134 : i32
%136 = spv.IMul %8, %cst3_i32 : i32
%137 = spv.IAdd %135, %136 : i32
%138 = spv.IMul %18, %cst3_i32 : i32
%139 = spv.IAdd %137, %138 : i32
%140 = spv.IAdd %139, %cst1_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %128[1 : i32] : f32 into vector<3xf32>
%144 = spv.IMul %72, %cst675_i32 : i32
%145 = spv.IMul %87, %cst3_i32 : i32
%146 = spv.IAdd %144, %145 : i32
%147 = spv.IMul %21, %cst675_i32 : i32
%148 = spv.IAdd %146, %147 : i32
%149 = spv.IMul %17, %cst675_i32 : i32
%150 = spv.IAdd %148, %149 : i32
%151 = spv.IMul %8, %cst3_i32 : i32
%152 = spv.IAdd %150, %151 : i32
%153 = spv.IMul %18, %cst3_i32 : i32
%154 = spv.IAdd %152, %153 : i32
%155 = spv.IAdd %154, %cst2_i32 : i32
%156 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %155] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%157 = spv.Load "StorageBuffer" %156 : f32
%158 = spv.CompositeInsert %157, %143[2 : i32] : f32 into vector<3xf32>
%159 = spv.CompositeExtract %158[0 : i32] : vector<3xf32>
%160 = spv.CompositeConstruct %159, %159, %159, %159 : vector<4xf32>
%161 = spv.GLSL.Fma %160, %100, %88 : vector<4xf32>
%162 = spv.CompositeExtract %158[1 : i32] : vector<3xf32>
%163 = spv.CompositeConstruct %162, %162, %162, %162 : vector<4xf32>
%164 = spv.GLSL.Fma %163, %107, %161 : vector<4xf32>
%165 = spv.CompositeExtract %158[2 : i32] : vector<3xf32>
%166 = spv.CompositeConstruct %165, %165, %165, %165 : vector<4xf32>
%167 = spv.GLSL.Fma %166, %114, %164 : vector<4xf32>
%168 = spv.IMul %72, %cst675_i32 : i32
%169 = spv.IMul %21, %cst675_i32 : i32
%170 = spv.IAdd %168, %169 : i32
%171 = spv.IMul %17, %cst675_i32 : i32
%172 = spv.IAdd %170, %171 : i32
%173 = spv.IMul %8, %cst3_i32 : i32
%174 = spv.IAdd %172, %173 : i32
%175 = spv.IMul %18, %cst3_i32 : i32
%176 = spv.IAdd %174, %175 : i32
%177 = spv.IMul %87, %cst3_i32 : i32
%178 = spv.IAdd %176, %177 : i32
%179 = spv.IAdd %178, %cst6_i32 : i32
%180 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %179] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%181 = spv.Load "StorageBuffer" %180 : f32
%182 = spv.CompositeInsert %181, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%183 = spv.IMul %72, %cst675_i32 : i32
%184 = spv.IMul %21, %cst675_i32 : i32
%185 = spv.IAdd %183, %184 : i32
%186 = spv.IMul %17, %cst675_i32 : i32
%187 = spv.IAdd %185, %186 : i32
%188 = spv.IMul %8, %cst3_i32 : i32
%189 = spv.IAdd %187, %188 : i32
%190 = spv.IMul %18, %cst3_i32 : i32
%191 = spv.IAdd %189, %190 : i32
%192 = spv.IMul %87, %cst3_i32 : i32
%193 = spv.IAdd %191, %192 : i32
%194 = spv.IAdd %193, %cst7_i32 : i32
%195 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %194] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%196 = spv.Load "StorageBuffer" %195 : f32
%197 = spv.CompositeInsert %196, %182[1 : i32] : f32 into vector<3xf32>
%198 = spv.IMul %72, %cst675_i32 : i32
%199 = spv.IMul %21, %cst675_i32 : i32
%200 = spv.IAdd %198, %199 : i32
%201 = spv.IMul %17, %cst675_i32 : i32
%202 = spv.IAdd %200, %201 : i32
%203 = spv.IMul %8, %cst3_i32 : i32
%204 = spv.IAdd %202, %203 : i32
%205 = spv.IMul %18, %cst3_i32 : i32
%206 = spv.IAdd %204, %205 : i32
%207 = spv.IMul %87, %cst3_i32 : i32
%208 = spv.IAdd %206, %207 : i32
%209 = spv.IAdd %208, %cst8_i32 : i32
%210 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %209] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%211 = spv.Load "StorageBuffer" %210 : f32
%212 = spv.CompositeInsert %211, %197[2 : i32] : f32 into vector<3xf32>
%213 = spv.CompositeExtract %212[0 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.GLSL.Fma %214, %100, %89 : vector<4xf32>
%216 = spv.CompositeExtract %212[1 : i32] : vector<3xf32>
%217 = spv.CompositeConstruct %216, %216, %216, %216 : vector<4xf32>
%218 = spv.GLSL.Fma %217, %107, %215 : vector<4xf32>
%219 = spv.CompositeExtract %212[2 : i32] : vector<3xf32>
%220 = spv.CompositeConstruct %219, %219, %219, %219 : vector<4xf32>
%221 = spv.GLSL.Fma %220, %114, %218 : vector<4xf32>
%222 = spv.IMul %72, %cst675_i32 : i32
%223 = spv.IMul %21, %cst675_i32 : i32
%224 = spv.IAdd %222, %223 : i32
%225 = spv.IMul %17, %cst675_i32 : i32
%226 = spv.IAdd %224, %225 : i32
%227 = spv.IMul %8, %cst3_i32 : i32
%228 = spv.IAdd %226, %227 : i32
%229 = spv.IMul %18, %cst3_i32 : i32
%230 = spv.IAdd %228, %229 : i32
%231 = spv.IMul %87, %cst3_i32 : i32
%232 = spv.IAdd %230, %231 : i32
%233 = spv.IAdd %232, %cst12_i32 : i32
%234 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %233] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%235 = spv.Load "StorageBuffer" %234 : f32
%236 = spv.CompositeInsert %235, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%237 = spv.IMul %72, %cst675_i32 : i32
%238 = spv.IMul %21, %cst675_i32 : i32
%239 = spv.IAdd %237, %238 : i32
%240 = spv.IMul %17, %cst675_i32 : i32
%241 = spv.IAdd %239, %240 : i32
%242 = spv.IMul %8, %cst3_i32 : i32
%243 = spv.IAdd %241, %242 : i32
%244 = spv.IMul %18, %cst3_i32 : i32
%245 = spv.IAdd %243, %244 : i32
%246 = spv.IMul %87, %cst3_i32 : i32
%247 = spv.IAdd %245, %246 : i32
%248 = spv.IAdd %247, %cst13_i32 : i32
%249 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %248] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%250 = spv.Load "StorageBuffer" %249 : f32
%251 = spv.CompositeInsert %250, %236[1 : i32] : f32 into vector<3xf32>
%252 = spv.IMul %72, %cst675_i32 : i32
%253 = spv.IMul %21, %cst675_i32 : i32
%254 = spv.IAdd %252, %253 : i32
%255 = spv.IMul %17, %cst675_i32 : i32
%256 = spv.IAdd %254, %255 : i32
%257 = spv.IMul %8, %cst3_i32 : i32
%258 = spv.IAdd %256, %257 : i32
%259 = spv.IMul %18, %cst3_i32 : i32
%260 = spv.IAdd %258, %259 : i32
%261 = spv.IMul %87, %cst3_i32 : i32
%262 = spv.IAdd %260, %261 : i32
%263 = spv.IAdd %262, %cst14_i32 : i32
%264 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %263] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%265 = spv.Load "StorageBuffer" %264 : f32
%266 = spv.CompositeInsert %265, %251[2 : i32] : f32 into vector<3xf32>
%267 = spv.CompositeExtract %266[0 : i32] : vector<3xf32>
%268 = spv.CompositeConstruct %267, %267, %267, %267 : vector<4xf32>
%269 = spv.GLSL.Fma %268, %100, %90 : vector<4xf32>
%270 = spv.CompositeExtract %266[1 : i32] : vector<3xf32>
%271 = spv.CompositeConstruct %270, %270, %270, %270 : vector<4xf32>
%272 = spv.GLSL.Fma %271, %107, %269 : vector<4xf32>
%273 = spv.CompositeExtract %266[2 : i32] : vector<3xf32>
%274 = spv.CompositeConstruct %273, %273, %273, %273 : vector<4xf32>
%275 = spv.GLSL.Fma %274, %114, %272 : vector<4xf32>
%276 = spv.IMul %72, %cst675_i32 : i32
%277 = spv.IMul %21, %cst675_i32 : i32
%278 = spv.IAdd %276, %277 : i32
%279 = spv.IMul %17, %cst675_i32 : i32
%280 = spv.IAdd %278, %279 : i32
%281 = spv.IMul %8, %cst3_i32 : i32
%282 = spv.IAdd %280, %281 : i32
%283 = spv.IMul %18, %cst3_i32 : i32
%284 = spv.IAdd %282, %283 : i32
%285 = spv.IMul %87, %cst3_i32 : i32
%286 = spv.IAdd %284, %285 : i32
%287 = spv.IAdd %286, %cst18_i32 : i32
%288 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %287] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%289 = spv.Load "StorageBuffer" %288 : f32
%290 = spv.CompositeInsert %289, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%291 = spv.IMul %72, %cst675_i32 : i32
%292 = spv.IMul %21, %cst675_i32 : i32
%293 = spv.IAdd %291, %292 : i32
%294 = spv.IMul %17, %cst675_i32 : i32
%295 = spv.IAdd %293, %294 : i32
%296 = spv.IMul %8, %cst3_i32 : i32
%297 = spv.IAdd %295, %296 : i32
%298 = spv.IMul %18, %cst3_i32 : i32
%299 = spv.IAdd %297, %298 : i32
%300 = spv.IMul %87, %cst3_i32 : i32
%301 = spv.IAdd %299, %300 : i32
%302 = spv.IAdd %301, %cst19_i32 : i32
%303 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %302] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%304 = spv.Load "StorageBuffer" %303 : f32
%305 = spv.CompositeInsert %304, %290[1 : i32] : f32 into vector<3xf32>
%306 = spv.IMul %72, %cst675_i32 : i32
%307 = spv.IMul %21, %cst675_i32 : i32
%308 = spv.IAdd %306, %307 : i32
%309 = spv.IMul %17, %cst675_i32 : i32
%310 = spv.IAdd %308, %309 : i32
%311 = spv.IMul %8, %cst3_i32 : i32
%312 = spv.IAdd %310, %311 : i32
%313 = spv.IMul %18, %cst3_i32 : i32
%314 = spv.IAdd %312, %313 : i32
%315 = spv.IMul %87, %cst3_i32 : i32
%316 = spv.IAdd %314, %315 : i32
%317 = spv.IAdd %316, %cst20_i32 : i32
%318 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %317] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%319 = spv.Load "StorageBuffer" %318 : f32
%320 = spv.CompositeInsert %319, %305[2 : i32] : f32 into vector<3xf32>
%321 = spv.CompositeExtract %320[0 : i32] : vector<3xf32>
%322 = spv.CompositeConstruct %321, %321, %321, %321 : vector<4xf32>
%323 = spv.GLSL.Fma %322, %100, %91 : vector<4xf32>
%324 = spv.CompositeExtract %320[1 : i32] : vector<3xf32>
%325 = spv.CompositeConstruct %324, %324, %324, %324 : vector<4xf32>
%326 = spv.GLSL.Fma %325, %107, %323 : vector<4xf32>
%327 = spv.CompositeExtract %320[2 : i32] : vector<3xf32>
%328 = spv.CompositeConstruct %327, %327, %327, %327 : vector<4xf32>
%329 = spv.GLSL.Fma %328, %114, %326 : vector<4xf32>
spv.Store "Function" %78, %167 : vector<4xf32>
spv.Store "Function" %79, %221 : vector<4xf32>
spv.Store "Function" %80, %275 : vector<4xf32>
spv.Store "Function" %81, %329 : vector<4xf32>
%330 = spv.IAdd %87, %cst1_i32 : i32
spv.Branch ^bb1(%330, %167, %221, %275, %329 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%82 = spv.Load "Function" %81 : vector<4xf32>
%83 = spv.Load "Function" %80 : vector<4xf32>
%84 = spv.Load "Function" %79 : vector<4xf32>
%85 = spv.Load "Function" %78 : vector<4xf32>
spv.Store "Function" %22, %85 : vector<4xf32>
spv.Store "Function" %23, %84 : vector<4xf32>
spv.Store "Function" %24, %83 : vector<4xf32>
spv.Store "Function" %25, %82 : vector<4xf32>
%86 = spv.IAdd %72, %cst1_i32 : i32
spv.Branch ^bb1(%86, %85, %84, %83, %82 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IMul %19, %cst896_i32 : i32
%43 = spv.IAdd %31, %42 : i32
%44 = spv.IMul %14, %cst896_i32 : i32
%45 = spv.IAdd %43, %44 : i32
%46 = spv.IMul %6, %cst8_i32 : i32
%47 = spv.IAdd %45, %46 : i32
%48 = spv.IMul %15, %cst8_i32 : i32
%49 = spv.IAdd %47, %48 : i32
%50 = spv.IAdd %49, %cst16_i32 : i32
%51 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %50] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %51, %27 : vector<4xf32>
%52 = spv.IMul %19, %cst896_i32 : i32
%53 = spv.IAdd %31, %52 : i32
%54 = spv.IMul %14, %cst896_i32 : i32
%55 = spv.IAdd %53, %54 : i32
%56 = spv.IMul %6, %cst8_i32 : i32
%57 = spv.IAdd %55, %56 : i32
%58 = spv.IMul %15, %cst8_i32 : i32
%59 = spv.IAdd %57, %58 : i32
%60 = spv.IAdd %59, %cst8_i32 : i32
%61 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %60] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %61, %28 : vector<4xf32>
%62 = spv.IMul %19, %cst896_i32 : i32
%63 = spv.IAdd %31, %62 : i32
%64 = spv.IMul %14, %cst896_i32 : i32
%65 = spv.IAdd %63, %64 : i32
%66 = spv.IMul %6, %cst8_i32 : i32
%67 = spv.IAdd %65, %66 : i32
%68 = spv.IMul %15, %cst8_i32 : i32
%69 = spv.IAdd %67, %68 : i32
%70 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %69] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %70, %29 : vector<4xf32>
%71 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%71 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
// -----// IR Dump After CSE //----- //
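// Note: CSE deduplicates what canonicalization left behind. The repeated
// spv.mlir.addressof of each builtin variable collapses to a single address,
// and common index subexpressions (e.g. the base offsets %74 and %118 below)
// are computed once and reused by the neighboring loads.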
spv.module Logical GLSL450 {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
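// Loop nest (inferred): the outer loop walks output rows starting at
// WorkgroupId.z with step 112; the two inner loops step over the 3x3 filter
// window, carrying four vec4 accumulators that the spv.GLSL.Fma chains update.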
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
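// The three vec4 loads above (%76, %79, %82) appear to fetch the filter taps
// for the three input channels at this filter position, spaced 8 vec4s
// (32 f32 output channels) apart.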
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
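            // The remaining three tile pixels reuse the same filter vectors; with stride 2 their
            // inputs sit 2, 4, and 6 columns to the right (f32 offsets 6..8, 12..14, 18..20).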
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
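            // Spill the four updated accumulators and step fw; the branch also forwards them
            // as loop-carried values.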
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
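      // Reduction done: reload the accumulators and index the 1x112x112x32 output as
      // vector<4xf32> elements at h*896 + w*8 + oc4 (896 = 112*32/4); the four tile pixels
      // land at offsets 0, 8, 16, and 24.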
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
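// For reference, a hypothetical Python sketch (not part of this dump; the function name and
// the tile decomposition are assumptions read off the index arithmetic above) of what one
// invocation of @conv_dispatch_0 accumulates for a single 4-pixel x 4-channel output tile:

def conv_tile(inp, filt, out_h, out_w0, oc4):
    """Scalar reference for one 4-pixel x 4-output-channel tile.

    inp:  [225][225][3] nested lists (NHWC input, batch dim dropped)
    filt: [3][3][3][32] nested lists (HWIO filter)
    Stride-2, unpadded 3x3 convolution, as in the original mhlo.convolution.
    """
    acc = [[0.0] * 4 for _ in range(4)]       # 4 pixels x 4 channels
    for fh in range(3):                       # spv.mlir.loop over %48
        for fw in range(3):                   # spv.mlir.loop over %63
            for px in range(4):               # unrolled: the four accumulators
                in_h = 2 * out_h + fh
                in_w = 2 * (out_w0 + px) + fw
                for ic in range(3):           # the vector<3xf32> of input loads
                    x = inp[in_h][in_w][ic]
                    for v in range(4):        # lanes of one vector<4xf32>
                        acc[px][v] += x * filt[fh][fw][ic][4 * oc4 + v]
    return acc

// The ic-innermost FMA chains above (%107 -> %110 -> %113, etc.) correspond to the two
// innermost Python loops.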
// -----// IR Dump After SPIRVUpdateVCE //----- //
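// Note: SPIRVUpdateVCE recomputes the minimal (version, capabilities, extensions) triple from
// the ops the module actually uses, so the shader below requires only v1.0 with [Shader] and
// [SPV_KHR_storage_buffer_storage_class] -- a small subset of the v1.4 target environment.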
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- //
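// Note: this dump shows the same spv.module nested back inside its hal.executable.variant. The
// entry point pins the workgroup size to [8, 2, 1] and a static workgroup count of 1x14x112
// (the 112x112x32 output tiled by workloadPerWorkgroup = [32, 8, 1]); the shader body is
// identical to the SPIRVUpdateVCE dump above.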
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- //
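// Note: the translated variant is now wrapped in the hal.executable private @conv_dispatch_0
// together with its public hal.interface; the SPIR-V inside is unchanged.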
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
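// Note: this pass appears to check the module-level hal.device.targets attribute (the Vulkan
// device with its buffer constraints and SPIR-V target environment, now visible on the module
// below) against the available target backends; the executable itself is untouched.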
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
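      // Note (derived from the IR above): the dispatch workload is 32x112x112
      // -- most likely the 32 output channels and the 112x112 spatial output --
      // and workloadPerWorkgroup is [32, 8, 1], so the workgroup count is
      // ceildiv(32, 32) x ceildiv(112, 8) x ceildiv(112, 1) = 1 x 14 x 112,
      // exactly the constants returned by the hal.return above.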
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
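          // Note: these bindings match the hal.interface at the end of this
          // executable: binding 0 is the 1x225x225x3 input, read as scalar f32;
          // binding 1 the 3x3x3x32 filter and binding 2 the 1x112x112x32 output,
          // both reinterpreted as vector<4xf32> arrays so each access moves four
          // consecutive output-channel values.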
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
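            // Note: per-invocation offsets, assuming the workload mapping noted
            // above (x = channels, y = output columns, z = output rows):
            //   %6  = wg_y * 8      column base of this workgroup's tile
            //   %7  = wg_x * 32     output-channel base (in scalars)
            //   %8  = wg_y * 16     input-column base (x2 for the stride-2 window)
            //   %15 = local_y * 4   this thread's first of 4 output columns
            //   %16 = local_x * 4   this thread's vec4 channel group (in scalars)
            //   %17 = local_z * 2, %18 = local_y * 8   the matching input offsets
            // The loop below iterates output rows starting at wg_z in steps of
            // 112; with 112 workgroups along z, each executes exactly one
            // iteration.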
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
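              // Note: in the nest below, %48 walks the 3 filter rows and the
              // inner %63 the 3 filter columns. The four vec4 accumulators (one
              // per output column handled by this thread) are carried as block
              // arguments; the Function-storage variables %22..%25 and %54..%57
              // only exist to carry the final values out of the structured
              // spv.mlir.loop regions.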
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
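                  // Note: first output column done -- the three scalars gathered
                  // into %104 are the 3 input channels at this (fh, fw) tap,
                  // each broadcast to vec4 and FMA'd against the matching filter
                  // vectors %76/%79/%82. The same pattern repeats below for the
                  // other three columns, whose input bases lie at +6, +12 and
                  // +18 scalars (stride 2 x 3 input channels per column step).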
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
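              // Note: %39 linearizes (output row %19, column base, channel
              // group) into the vec4 output array: 896 = 112 columns x 8 vec4s
              // per pixel and 8 = 32 channels / 4. The four stores below write
              // this thread's four columns at offsets +24, +16, +8 and +0.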
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
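  // Note: at this point the host side is still at the flow level: @conv casts
  // the two buffer views to tensors, runs the single dispatch inside a
  // flow.ex.stream.fragment with workload [32, 112, 112], and casts the result
  // back. The hal.bindings attribute already pins operands 0/1 and the result
  // to s0b0_ro_external/s0b1_ro_external/s0b2_xw_external. The ConvertToHALPass
  // dump below replaces this with explicit device, allocator and
  // command-buffer operations.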
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%c112 = constant 112 : index
%c32 = constant 32 : index
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%c1 = constant 1 : index
%c225 = constant 225 : index
%c3 = constant 3 : index
%sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c225, %c225, %c3]) type(%c50331680_i32) encoding(%c1_i32) : index
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%c32_1 = constant 32 : index
%sz_2 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c3, %c3, %c3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
%c112_3 = constant 112 : index
%sz_4 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112_3, %c112_3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
%buffer_5 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz_4}
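    // Note: the element type 50331680 is 0x03000020, IREE's code for an IEEE
    // float (0x03) of bit width 32 (0x20), and encoding 1 is dense row-major;
    // the three compute_size queries thus cover the f32 input, filter and
    // output shapes. Only the output (%sz_4) is allocated here -- the input
    // and filter arrive as caller-provided buffer views.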
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%c112_6 = constant 112 : index
%c32_7 = constant 32 : index
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
%c0 = constant 0 : index
%c2 = constant 2 : index
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %sz],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %sz_2],
%c2 = (%buffer_5 : !hal.buffer)[%c0, %sz_4]
])
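      // Note: bindings 0/1/2 of set 0 receive the input, filter and freshly
      // allocated output buffers, each bound over its full computed byte range
      // from offset 0; the dispatch below then records the 1x14x112 workgroup
      // grid that the entry point computed at compile time.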
%c1_14 = constant 1 : index
%c14 = constant 14 : index
%c112_15 = constant 112 : index
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1_14, %c14, %c112_15])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
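    // Note: one-shot submission -- the execution barrier orders the dispatch's
    // writes before any later command issue, the command buffer is ended, and
    // hal.ex.submit_and_wait blocks the host until the device completes.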
%c1_8 = constant 1 : index
%c112_9 = constant 112 : index
%c112_10 = constant 112 : index
%c32_11 = constant 32 : index
%c50331680_i32_12 = constant 50331680 : i32
%c1_i32_13 = constant 1 : i32
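    // Note: the result is re-wrapped as a 1x112x112x32 f32 buffer view over
    // the output buffer; the duplicated shape/type constants are conversion
    // artifacts that later canonicalization presumably folds.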
%view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c1_8, %c112_9, %c112_10, %c32_11]) type(%c50331680_i32_12) encoding(%c1_i32_13) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionRankedShapeDimsPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%c112 = constant 112 : index
%c32 = constant 32 : index
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%c1 = constant 1 : index
%c225 = constant 225 : index
%c3 = constant 3 : index
%sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c225, %c225, %c3]) type(%c50331680_i32) encoding(%c1_i32) : index
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%c32_1 = constant 32 : index
%sz_2 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c3, %c3, %c3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
%c112_3 = constant 112 : index
%sz_4 = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112_3, %c112_3, %c32_1]) type(%c50331680_i32) encoding(%c1_i32) : index
%buffer_5 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz_4}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%c112_6 = constant 112 : index
%c32_7 = constant 32 : index
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
%c0 = constant 0 : index
%c2 = constant 2 : index
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %sz],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %sz_2],
%c2 = (%buffer_5 : !hal.buffer)[%c0, %sz_4]
])
%c1_14 = constant 1 : index
%c14 = constant 14 : index
%c112_15 = constant 112 : index
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1_14, %c14, %c112_15])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%c1_8 = constant 1 : index
%c112_9 = constant 112 : index
%c112_10 = constant 112 : index
%c32_11 = constant 32 : index
%c50331680_i32_12 = constant 50331680 : i32
%c1_i32_13 = constant 1 : i32
%view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c1_8, %c112_9, %c112_10, %c32_11]) type(%c50331680_i32_12) encoding(%c1_i32_13) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
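// Note: the hal.allocator.compute_size ops above are folded into plain
// constants by the Canonicalizer below (%c607500, %c3456, %c1605632). A quick
// sanity check of those byte sizes, assuming densely packed f32 data at
// 4 bytes per element (a sketch, not compiler output):

def packed_size(shape, bytes_per_elem=4):
    """Byte size of a densely packed tensor."""
    n = 1
    for dim in shape:
        n *= dim
    return n * bytes_per_elem

assert packed_size([1, 225, 225, 3]) == 607500    # input  -> %c607500
assert packed_size([3, 3, 3, 32]) == 3456         # filter -> %c3456
assert packed_size([1, 112, 112, 32]) == 1605632  # output -> %c1605632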
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
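// The buffer view element type %c50331680_i32 equals 0x03000020. A hedged
// decode, assuming IREE's HAL packs (numerical category << 24) | bit width,
// with category 0x03 = float, i.e. f32; %c1_i32 would then be the default
// dense row-major encoding:

elem_type = 50331680
assert hex(elem_type) == "0x3000020"
category, bit_width = elem_type >> 24, elem_type & 0xFF
assert (category, bit_width) == (0x03, 32)  # float, 32 bits -> f32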
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
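// CSE leaves the function unchanged, and the dispatch still launches
// workgroups([%c1, %c14, %c112]). Those counts follow from tiling the
// 1x112x112x32 output by workloadPerWorkgroup = [32, 8, 1] (a sketch,
// assuming the x/y/z axes map to channels/width/height, as the constant
// region in the entry point suggests):

import math

out_channels, out_width, out_height = 32, 112, 112
tile = (32, 8, 1)
workgroups = (
    math.ceil(out_channels / tile[0]),  # x: 32 channels / 32 -> 1
    math.ceil(out_width / tile[1]),     # y: 112 cols    / 8  -> 14
    math.ceil(out_height / tile[2]),    # z: 112 rows    / 1  -> 112
)
assert workgroups == (1, 14, 112)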
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PackAllocationsPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
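// PackAllocationsPass has nothing to pack here: the only transient buffer is
// the single %c1605632-byte result allocation, so the dump is unchanged. The
// three pushed bindings line up with the executable's spv.GlobalVariable
// bind(0, N) declarations (a sketch; the role names are descriptive labels,
// not taken from the IR):

bindings = {
    0: ("conv input",  607500),   # @s0b0_ro_external -> __resource_var_0_0_
    1: ("conv filter", 3456),     # @s0b1_ro_external -> __resource_var_0_1_
    2: ("conv output", 1605632),  # @s0b2_xw_external -> __resource_var_0_2_
}
total_bytes = sum(size for _, size in bindings.values())
assert total_bytes == 2_216_588  # bytes bound per dispatch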
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkTargetExecutablesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
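// Inside spv.func @conv_dispatch_0, each invocation carries four
// vector<4xf32> accumulators: one vec4 of output channels for each of four
// consecutive output-width positions. Per (filter_h, filter_w) tap, the three
// filter vec4s (%76, %79, %82, one per input channel) are reused across all
// four pixels via spv.GLSL.Fma chains. A minimal Python sketch of that
// accumulation pattern (dummy data; not compiler output):

def accumulate_tap(acc, pixels, filt):
    # acc:    4 pixels x 4 output channels (the four Fma chains)
    # pixels: 4 pixels x 3 input channels (the vector<3xf32> loads)
    # filt:   3 input channels x 4 output channels (%76, %79, %82)
    for p in range(4):
        for c in range(3):
            x = pixels[p][c]  # scalar input, splatted by CompositeConstruct
            acc[p] = [x * w + a for w, a in zip(filt[c], acc[p])]
    return acc

acc = [[0.0] * 4 for _ in range(4)]
pixels = [[1.0, 2.0, 3.0]] * 4
filt = [[0.5] * 4] * 3
acc = accumulate_tap(acc, pixels, filt)
assert acc[0] == [3.0, 3.0, 3.0, 3.0]  # (1 + 2 + 3) * 0.5 per channel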
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkExecutablesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv_fb::@conv_dispatch_0) workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
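// The stores at the bottom of the loop nest write four vec4s at %39, %39+8,
// %39+16, and %39+24. That spacing matches an NHWC output flattened to
// vector<4xf32> units: 32 channels = 8 vec4s per pixel (%cst8_i32), and
// 112 pixels x 8 = 896 vec4s per output row (%cst896_i32). Checking the
// strides (a sketch, assuming dense NHWC layout):

channels, width = 32, 112
vec4s_per_pixel = channels // 4           # -> 8, pixel stride between stores
vec4s_per_row = width * vec4s_per_pixel   # -> 896, row stride (%cst896_i32)
assert (vec4s_per_pixel, vec4s_per_row) == (8, 896)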
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveEntryPointOrdinalsPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
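// [annotation] The workgroup counts returned above follow from the 1x112x112x32 output
// tiled by workloadPerWorkgroup = [32, 8, 1]: 32/32 = 1, 112/8 = 14, 112/1 = 112.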
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
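// [annotation] Kernel structure, as far as this dump shows: with LocalSize 8x2x1, each
// invocation accumulates a 4-column x 4-channel output tile in four vector<4xf32>
// registers. The outer spv.mlir.loop steps the workgroup z id by 112, so it runs once
// per workgroup (one output row each); the two inner loops walk the 3x3 filter window.
// Input pixels are read as scalar f32 (NHWC with C = 3; the stride-2 columns appear as
// offsets +6/+12/+18 from the first pixel), while the filter and output are accessed
// as vector<4xf32>.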
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
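// [annotation] Writeback below: %39 indexes the output in vector<4xf32> units. The base
// is (channel offset)/4 plus out_row*896 (112 columns * 32 channels / 4 = 896 vec4 per
// row) plus column*8 (32 channels / 4 = 8 vec4 per pixel); the four accumulators land
// at offsets +0, +8, +16, +24.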
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%exe = hal.executable.lookup device(%0 : !hal.device) executable(@conv_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
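// [annotation] Canonicalization reorders the constants and folds away the
// hal.command_buffer.device query inside the device.switch region:
// hal.executable.lookup now uses the captured %device directly.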
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c607500 = constant 607500 : index
%c3456 = constant 3456 : index
%c1605632 = constant 1605632 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%exe = hal.executable.lookup device(%device : !hal.device) executable(@conv_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
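// [annotation] CSE finds nothing to eliminate here; the function below is identical
// to the canonicalized form above.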
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c607500 = constant 607500 : index
%c3456 = constant 3456 : index
%c1605632 = constant 1605632 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%exe = hal.executable.lookup device(%device : !hal.device) executable(@conv_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass //----- //
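// [annotation] This pass hoists resource creation out of @conv into cached util.global
// values with util.initializer bodies: one descriptor set layout, one executable
// layout, and one executable (created under a per-device hal.device.switch). The
// hal.executable_layout.lookup / hal.executable.lookup ops in @conv become
// util.global.load of those caches.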
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
util.global private @_executable_conv_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
hal.return %exe : !hal.executable
},
#hal.match.always {
%1 = util.null : !hal.executable
hal.return %1 : !hal.executable
}
util.global.store %0, @_executable_conv_dispatch_0 : !hal.executable
util.initializer.return
}
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c607500 = constant 607500 : index
%c3456 = constant 3456 : index
%c1605632 = constant 1605632 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
hal.device.switch<%device : !hal.device>
#hal.device.match.executable.format<"vulkan-spirv-fb"> {
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.return
}
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
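// [annotation] hal.device.switch regions are inlined into plain CFG: a hal.device.query
// of "hal.executable.format" == "vulkan-spirv-fb" feeds a cond_br, and the
// #hal.match.always fallback becomes the branch that yields util.null (in the
// executable initializer) or hits util.unreachable (in @conv).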
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
%true = constant true
cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c607500 = constant 607500 : index
%c3456 = constant 3456 : index
%c1605632 = constant 1605632 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
br ^bb3
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
// -----// IR Dump After ConvertAffineToStandard //----- //
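// [annotation] No affine ops remain at this stage, so this dump appears unchanged from
// the previous one.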
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
util.global private @_executable_conv_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
%true = constant true
cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
util.initializer.return
}
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
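// Note (a reading of the kernel below): binding 0 is the 1x225x225x3 input as scalar
// f32; bindings 1 and 2 are the 3x3x3x32 filter and the 1x112x112x32 output as
// vector<4xf32>. Each invocation keeps four vector<4xf32> accumulators, apparently
// four adjacent output pixels along the width with four output channels each; the two
// inner loops walk the 3x3 filter window, and the outer loop is the
// workgroup-distributed loop over the 112 output rows (input rows advance by 2,
// matching the stride-2 convolution).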
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
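// Note: %76/%79/%82 appear to be the filter vectors (four output channels) for the
// three input channels at the current 3x3 tap, and %104 gathers the three input
// channels of one input pixel; the three Fma ops above accumulate that pixel's
// contribution into the first accumulator. The blocks below repeat the pattern at
// input offsets +6, +12, and +18 floats, i.e. two input pixels (of three channels)
// per output pixel, consistent with the stride-2 convolution.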
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c607500 = constant 607500 : index
%c3456 = constant 3456 : index
%c1605632 = constant 1605632 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
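// Note: the sizes match the tensor shapes at 4 bytes per f32: 607500 = 1*225*225*3*4
// (input), 3456 = 3*3*3*32*4 (filter), 1605632 = 1*112*112*32*4 (output). 50331680 is
// 0x03000020, IREE's HAL element-type code for f32 (numerical kind 0x03 = float,
// 0x20 = 32 bits), and encoding 1 is presumably the dense row-major encoding.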
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
br ^bb3
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass //----- //
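// Note: MemoizeDeviceQueriesPass hoists the repeated hal.device.query of
// ("hal.executable.format" :: "vulkan-spirv-fb") into the @_device_query_0 /
// @_device_query_0_ok globals below, populated once by an initializer so @conv loads
// the cached result instead of querying the device on every call.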
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
util.global private @_device_query_0 : i1
util.global private @_device_query_0_ok : i1
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
util.global private @_executable_layout_0 : !hal.executable_layout
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
util.global private @_executable_conv_dispatch_0 : !hal.executable
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
%true = constant true
cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%0 = util.null : !hal.executable
br ^bb5(%0 : !hal.executable)
^bb4: // pred: ^bb2
util.unreachable "device not supported in the compiled configuration"
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3
util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
util.initializer.return
}
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_0_0_ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_1_ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_0_2_ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%cst896_i32 = spv.Constant 896 : i32
%cst20_i32 = spv.Constant 20 : i32
%cst19_i32 = spv.Constant 19 : i32
%cst18_i32 = spv.Constant 18 : i32
%cst14_i32 = spv.Constant 14 : i32
%cst13_i32 = spv.Constant 13 : i32
%cst12_i32 = spv.Constant 12 : i32
%cst7_i32 = spv.Constant 7 : i32
%cst6_i32 = spv.Constant 6 : i32
%cst675_i32 = spv.Constant 675 : i32
%cst24_i32 = spv.Constant 24 : i32
%cst72_i32 = spv.Constant 72 : i32
%cst_vec_4xf32 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%cst_vec_3xf32 = spv.Constant dense<0.000000e+00> : vector<3xf32>
%cst8_i32 = spv.Constant 8 : i32
%cst2_i32 = spv.Constant 2 : i32
%cst4_i32 = spv.Constant 4 : i32
%cst16_i32 = spv.Constant 16 : i32
%cst32_i32 = spv.Constant 32 : i32
%cst0_i32 = spv.Constant 0 : i32
%cst112_i32 = spv.Constant 112 : i32
%cst3_i32 = spv.Constant 3 : i32
%cst1_i32 = spv.Constant 1 : i32
%__resource_var_0_0__addr = spv.mlir.addressof @__resource_var_0_0_ : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>
%__resource_var_0_1__addr = spv.mlir.addressof @__resource_var_0_1_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__resource_var_0_2__addr = spv.mlir.addressof @__resource_var_0_2_ : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>
%__builtin_var_WorkgroupId___addr = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%0 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32>
%2 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%3 = spv.CompositeExtract %2[1 : i32] : vector<3xi32>
%4 = spv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi32>
%5 = spv.CompositeExtract %4[2 : i32] : vector<3xi32>
%6 = spv.IMul %3, %cst8_i32 : i32
%7 = spv.IMul %1, %cst32_i32 : i32
%8 = spv.IMul %3, %cst16_i32 : i32
%__builtin_var_LocalInvocationId___addr = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%9 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
%11 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%12 = spv.CompositeExtract %11[1 : i32] : vector<3xi32>
%13 = spv.Load "Input" %__builtin_var_LocalInvocationId___addr : vector<3xi32>
%14 = spv.CompositeExtract %13[2 : i32] : vector<3xi32>
%15 = spv.IMul %12, %cst4_i32 : i32
%16 = spv.IMul %10, %cst4_i32 : i32
%17 = spv.IMul %14, %cst2_i32 : i32
%18 = spv.IMul %12, %cst8_i32 : i32
spv.mlir.loop {
spv.Branch ^bb1(%5 : i32)
^bb1(%19: i32): // 2 preds: ^bb0, ^bb2
%20 = spv.SLessThan %19, %cst112_i32 : i32
spv.BranchConditional %20, ^bb2, ^bb3
^bb2: // pred: ^bb1
%21 = spv.IMul %19, %cst2_i32 : i32
%22 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%23 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%24 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%25 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32, %cst_vec_4xf32 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%48: i32, %49: vector<4xf32>, %50: vector<4xf32>, %51: vector<4xf32>, %52: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%53 = spv.SLessThan %48, %cst3_i32 : i32
spv.BranchConditional %53, ^bb2, ^bb3
^bb2: // pred: ^bb1
%54 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%55 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%56 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%57 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%cst0_i32, %49, %50, %51, %52 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%63: i32, %64: vector<4xf32>, %65: vector<4xf32>, %66: vector<4xf32>, %67: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%68 = spv.SLessThan %63, %cst3_i32 : i32
spv.BranchConditional %68, ^bb2, ^bb3
^bb2: // pred: ^bb1
%69 = spv.IAdd %7, %16 : i32
%70 = spv.SDiv %69, %cst4_i32 : i32
%71 = spv.IMul %48, %cst72_i32 : i32
%72 = spv.IMul %63, %cst24_i32 : i32
%73 = spv.IAdd %71, %72 : i32
%74 = spv.IAdd %73, %70 : i32
%75 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %74] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%76 = spv.Load "StorageBuffer" %75 : vector<4xf32>
%77 = spv.IAdd %74, %cst8_i32 : i32
%78 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %77] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%79 = spv.Load "StorageBuffer" %78 : vector<4xf32>
%80 = spv.IAdd %74, %cst16_i32 : i32
%81 = spv.AccessChain %__resource_var_0_1__addr[%cst0_i32, %80] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%82 = spv.Load "StorageBuffer" %81 : vector<4xf32>
%83 = spv.IMul %48, %cst675_i32 : i32
%84 = spv.IMul %63, %cst3_i32 : i32
%85 = spv.IAdd %83, %84 : i32
%86 = spv.IMul %21, %cst675_i32 : i32
%87 = spv.IAdd %85, %86 : i32
%88 = spv.IMul %17, %cst675_i32 : i32
%89 = spv.IAdd %87, %88 : i32
%90 = spv.IMul %8, %cst3_i32 : i32
%91 = spv.IAdd %89, %90 : i32
%92 = spv.IMul %18, %cst3_i32 : i32
%93 = spv.IAdd %91, %92 : i32
%94 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %93] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%95 = spv.Load "StorageBuffer" %94 : f32
%96 = spv.CompositeInsert %95, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%97 = spv.IAdd %93, %cst1_i32 : i32
%98 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %97] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%99 = spv.Load "StorageBuffer" %98 : f32
%100 = spv.CompositeInsert %99, %96[1 : i32] : f32 into vector<3xf32>
%101 = spv.IAdd %93, %cst2_i32 : i32
%102 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %101] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%103 = spv.Load "StorageBuffer" %102 : f32
%104 = spv.CompositeInsert %103, %100[2 : i32] : f32 into vector<3xf32>
%105 = spv.CompositeExtract %104[0 : i32] : vector<3xf32>
%106 = spv.CompositeConstruct %105, %105, %105, %105 : vector<4xf32>
%107 = spv.GLSL.Fma %106, %76, %64 : vector<4xf32>
%108 = spv.CompositeExtract %104[1 : i32] : vector<3xf32>
%109 = spv.CompositeConstruct %108, %108, %108, %108 : vector<4xf32>
%110 = spv.GLSL.Fma %109, %79, %107 : vector<4xf32>
%111 = spv.CompositeExtract %104[2 : i32] : vector<3xf32>
%112 = spv.CompositeConstruct %111, %111, %111, %111 : vector<4xf32>
%113 = spv.GLSL.Fma %112, %82, %110 : vector<4xf32>
%114 = spv.IAdd %83, %86 : i32
%115 = spv.IAdd %114, %88 : i32
%116 = spv.IAdd %115, %90 : i32
%117 = spv.IAdd %116, %92 : i32
%118 = spv.IAdd %117, %84 : i32
%119 = spv.IAdd %118, %cst6_i32 : i32
%120 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %119] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%121 = spv.Load "StorageBuffer" %120 : f32
%122 = spv.CompositeInsert %121, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%123 = spv.IAdd %118, %cst7_i32 : i32
%124 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %123] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeInsert %125, %122[1 : i32] : f32 into vector<3xf32>
%127 = spv.IAdd %118, %cst8_i32 : i32
%128 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %127] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%129 = spv.Load "StorageBuffer" %128 : f32
%130 = spv.CompositeInsert %129, %126[2 : i32] : f32 into vector<3xf32>
%131 = spv.CompositeExtract %130[0 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.GLSL.Fma %132, %76, %65 : vector<4xf32>
%134 = spv.CompositeExtract %130[1 : i32] : vector<3xf32>
%135 = spv.CompositeConstruct %134, %134, %134, %134 : vector<4xf32>
%136 = spv.GLSL.Fma %135, %79, %133 : vector<4xf32>
%137 = spv.CompositeExtract %130[2 : i32] : vector<3xf32>
%138 = spv.CompositeConstruct %137, %137, %137, %137 : vector<4xf32>
%139 = spv.GLSL.Fma %138, %82, %136 : vector<4xf32>
%140 = spv.IAdd %118, %cst12_i32 : i32
%141 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %140] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%142 = spv.Load "StorageBuffer" %141 : f32
%143 = spv.CompositeInsert %142, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%144 = spv.IAdd %118, %cst13_i32 : i32
%145 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %144] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%146 = spv.Load "StorageBuffer" %145 : f32
%147 = spv.CompositeInsert %146, %143[1 : i32] : f32 into vector<3xf32>
%148 = spv.IAdd %118, %cst14_i32 : i32
%149 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %148] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%150 = spv.Load "StorageBuffer" %149 : f32
%151 = spv.CompositeInsert %150, %147[2 : i32] : f32 into vector<3xf32>
%152 = spv.CompositeExtract %151[0 : i32] : vector<3xf32>
%153 = spv.CompositeConstruct %152, %152, %152, %152 : vector<4xf32>
%154 = spv.GLSL.Fma %153, %76, %66 : vector<4xf32>
%155 = spv.CompositeExtract %151[1 : i32] : vector<3xf32>
%156 = spv.CompositeConstruct %155, %155, %155, %155 : vector<4xf32>
%157 = spv.GLSL.Fma %156, %79, %154 : vector<4xf32>
%158 = spv.CompositeExtract %151[2 : i32] : vector<3xf32>
%159 = spv.CompositeConstruct %158, %158, %158, %158 : vector<4xf32>
%160 = spv.GLSL.Fma %159, %82, %157 : vector<4xf32>
%161 = spv.IAdd %118, %cst18_i32 : i32
%162 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %161] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%163 = spv.Load "StorageBuffer" %162 : f32
%164 = spv.CompositeInsert %163, %cst_vec_3xf32[0 : i32] : f32 into vector<3xf32>
%165 = spv.IAdd %118, %cst19_i32 : i32
%166 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %165] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%167 = spv.Load "StorageBuffer" %166 : f32
%168 = spv.CompositeInsert %167, %164[1 : i32] : f32 into vector<3xf32>
%169 = spv.IAdd %118, %cst20_i32 : i32
%170 = spv.AccessChain %__resource_var_0_0__addr[%cst0_i32, %169] : !spv.ptr<!spv.struct<(!spv.rtarray<f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.CompositeInsert %171, %168[2 : i32] : f32 into vector<3xf32>
%173 = spv.CompositeExtract %172[0 : i32] : vector<3xf32>
%174 = spv.CompositeConstruct %173, %173, %173, %173 : vector<4xf32>
%175 = spv.GLSL.Fma %174, %76, %67 : vector<4xf32>
%176 = spv.CompositeExtract %172[1 : i32] : vector<3xf32>
%177 = spv.CompositeConstruct %176, %176, %176, %176 : vector<4xf32>
%178 = spv.GLSL.Fma %177, %79, %175 : vector<4xf32>
%179 = spv.CompositeExtract %172[2 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.GLSL.Fma %180, %82, %178 : vector<4xf32>
spv.Store "Function" %54, %113 : vector<4xf32>
spv.Store "Function" %55, %139 : vector<4xf32>
spv.Store "Function" %56, %160 : vector<4xf32>
spv.Store "Function" %57, %181 : vector<4xf32>
%182 = spv.IAdd %63, %cst1_i32 : i32
spv.Branch ^bb1(%182, %113, %139, %160, %181 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%58 = spv.Load "Function" %57 : vector<4xf32>
%59 = spv.Load "Function" %56 : vector<4xf32>
%60 = spv.Load "Function" %55 : vector<4xf32>
%61 = spv.Load "Function" %54 : vector<4xf32>
spv.Store "Function" %22, %61 : vector<4xf32>
spv.Store "Function" %23, %60 : vector<4xf32>
spv.Store "Function" %24, %59 : vector<4xf32>
spv.Store "Function" %25, %58 : vector<4xf32>
%62 = spv.IAdd %48, %cst1_i32 : i32
spv.Branch ^bb1(%62, %61, %60, %59, %58 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%26 = spv.Load "Function" %25 : vector<4xf32>
%27 = spv.Load "Function" %24 : vector<4xf32>
%28 = spv.Load "Function" %23 : vector<4xf32>
%29 = spv.Load "Function" %22 : vector<4xf32>
%30 = spv.IAdd %7, %16 : i32
%31 = spv.SDiv %30, %cst4_i32 : i32
%32 = spv.IMul %19, %cst896_i32 : i32
%33 = spv.IAdd %31, %32 : i32
%34 = spv.IMul %14, %cst896_i32 : i32
%35 = spv.IAdd %33, %34 : i32
%36 = spv.IMul %6, %cst8_i32 : i32
%37 = spv.IAdd %35, %36 : i32
%38 = spv.IMul %15, %cst8_i32 : i32
%39 = spv.IAdd %37, %38 : i32
%40 = spv.IAdd %39, %cst24_i32 : i32
%41 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %40] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %41, %26 : vector<4xf32>
%42 = spv.IAdd %39, %cst16_i32 : i32
%43 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %42] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %43, %27 : vector<4xf32>
%44 = spv.IAdd %39, %cst8_i32 : i32
%45 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %44] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %45, %28 : vector<4xf32>
%46 = spv.AccessChain %__resource_var_0_2__addr[%cst0_i32, %39] : !spv.ptr<!spv.struct<(!spv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %46, %29 : vector<4xf32>
%47 = spv.IAdd %19, %cst112_i32 : i32
spv.Branch ^bb1(%47 : i32)
^bb3: // pred: ^bb1
spv.mlir.merge
}
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 8, 2, 1
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c607500 = constant 607500 : index
%c3456 = constant 3456 : index
%c1605632 = constant 1605632 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%c1_i32 = constant 1 : i32
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1
%_device_query_0 = util.global.load @_device_query_0 : i1
cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
br ^bb3
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
^bb3: // pred: ^bb1
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
}
}
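// Note: the Canonicalizer/CSE dumps that follow are per-initializer and per-function
// snapshots. The substantive changes are in the @_executable_conv_dispatch_0
// initializer (constant-true branch folded) and in @conv (dead query-ok load removed).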
// -----// IR Dump After Canonicalizer //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// -----// IR Dump After CSE //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
util.global.store %ok, @_device_query_0_ok : i1
util.global.store %value, @_device_query_0 : i1
util.initializer.return
}
// -----// IR Dump After Canonicalizer //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// -----// IR Dump After Canonicalizer //----- //
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// -----// IR Dump After CSE //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
util.initializer.return
}
// -----// IR Dump After Canonicalizer //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
util.initializer.return
}
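// Note: canonicalization folded the `cond_br %true` away here: the dead
// util.unreachable branch is gone, and the merge block now has two predecessors
// storing either the created executable or util.null.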
// -----// IR Dump After CSE //----- //
util.initializer {
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
util.initializer.return
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
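// Note: in @conv, canonicalization dropped the unused @_device_query_0_ok load,
// merged the straight-line ^bb3 into ^bb1, and moved the unreachable "device not
// supported" block to the end of the function.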
// -----// IR Dump After CSE //----- //
util.initializer {
%device = hal.ex.shared_device : !hal.device
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "vulkan-spirv-fb") : i1, i1 = false
cond_br %value, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv_fb) layouts([%_executable_layout_0]) : !hal.executable
br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
%0 = util.null : !hal.executable
br ^bb3(%0 : !hal.executable)
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
util.global.store %1, @_executable_conv_dispatch_0 : !hal.executable
util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeTargetExecutablesPass //----- //
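// Note: SerializeTargetExecutablesPass lowers each hal.executable.variant to a target
// binary: the spv.module above is assembled to SPIR-V and wrapped in a
// "vulkan-spirv-fb" FlatBuffer, replacing the variant with the 5744-byte
// hal.executable.binary below (hex in the dense attribute, mime type
// application/x-flatbuffers).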
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.binary public @vulkan_spirv_fb attributes {data = dense<"0x0800000053505645A0E9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30008D050000030223070000010016000000F30000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000B0006008C000000474C534C2E7374642E343530000000000E00030000000000010000000F0009000500000013000000636F6E765F64697370617463685F3000050000000400000010000600130000001100000008000000020000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050007000A0000005F5F7265736F757263655F7661725F305F305F00050007000F0000005F5F7265736F757263655F7661725F305F315F0005000700100000005F5F7265736F757263655F7661725F305F325F000500060013000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000040000004800050007000000000000002300000000000000470003000700000002000000470004000A0000002100000000000000470004000A0000002200000000000000470004000D0000000600000010000000480005000C000000000000002300000000000000470003000C00000002000000470004000F0000002100000001000000470004000F0000002200000000000000470003000C0000000200000047000400100000002100000002000000470004001000000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B0004000100000005000000010000001600030009000000200000001D00030008000000090000001E000300070000000800000020000400060000000C000000070000003B000400060000000A0000000C000000170004000E00000009000000040000001D0003000D0000000E0000001E0003000C0000000D000000200004000B0000000C0000000C0000003B0004000B0000000F0000000C0000003B0004000B000000100000000C00000013000200120000002100030011000000120000002B0004000300000015000000800300002B0004000300000016000000140000002B0004000300000017000000130000002B0004000300000018000000120000002B00040003000000190000000E0000002B000400030000001A0000000D0000002B000400030000001B0000000C0000002B000400030000001C000000070000002B000400030000001D000000060000002B000400030000001E000000A30200002B000400030000001F000000180000002B0004000300000020000000480000002B0004000900000022000000000000002C0007000E0000002100000022000000220000002200000022000000170004002300000009000000030000002C00060023000000240000002200000022000000220000002B0004000300000025000000080000002B0004000300000026000000020000002B0004000300000027000000040000002B0004000300000028000000100000002B0004000300000029000000200000002B000400030000002A000000000000002B000400030000002B000000700000002B000400030000002C000000030000002B000400030000002D0000000100000014000200460000002000040049000000070000000E00000020000400690000000C0000000E000000200004007D0000000C000000090000003600050012000000130000000000000011000000F8000200140000003B000400490000004A000000070000003B000400490000004B000000070000003B000400490000004C000000070000003B000400490000004D000000070000003B0004004900000057000000070000003B0004004900000058000000070000003B0004004900000059000000070000003B000400490000005A000000070000003D000400020000002E0000000500000051000500030000002F0000002E000000000000003D00040002000000300000000500000051000500030000003100000030000000010000003D000400020000003200000005000000510005000300000033000000320000000200000084000500030000003400000031000000250000008400050003000000350000002F0000002900000084000500030000003600000031000000280000003D00040002000000370000000400000051000500030000003800000037000000000000003D0004000
2000000390000000400000051000500030000003A00000039000000010000003D000400020000003B0000000400000051000500030000003C0000003B0000000200000084000500030000003D0000003A0000002700000084000500030000003E000000380000002700000084000500030000003F0000003C000000260000008400050003000000400000003A00000025000000F900020041000000F800020041000000F50007000300000044000000F2000000450000003300000014000000B10005004600000047000000440000002B000000F6000400430000004200000000000000FA000400470000004200000043000000F8000200420000008400050003000000480000004400000026000000F90002004E000000F80002004E000000F50007000300000050000000DC000000510000002A00000042000000F50007000E00000052000000DB000000510000002100000042000000F50007000E00000053000000DA000000510000002100000042000000F50007000E00000054000000D9000000510000002100000042000000F50007000E00000055000000D8000000510000002100000042000000B10005004600000056000000500000002C000000F6000400450000004F00000000000000FA000400560000004F00000045000000F80002004F000000F90002005B000000F80002005B000000F5000700030000005D000000D70000005C0000002A0000004F000000F50007000E0000005E000000920000005C000000520000004F000000F50007000E0000005F000000AC0000005C000000530000004F000000F50007000E00000060000000C10000005C000000540000004F000000F50007000E00000061000000D60000005C000000550000004F000000B100050046000000620000005D0000002C000000F6000400510000005C00000000000000FA000400620000005C00000051000000F80002005C000000800005000300000063000000350000003E000000870005000300000064000000630000002700000084000500030000006500000050000000200000008400050003000000660000005D0000001F0000008000050003000000670000006500000066000000800005000300000068000000670000006400000041000600690000006A0000000F0000002A000000680000003D0004000E0000006B0000006A00000080000500030000006C000000680000002500000041000600690000006D0000000F0000002A0000006C0000003D0004000E0000006E0000006D00000080000500030000006F00000068000000280000004100060069000000700000000F0000002A0000006F0000003D0004000E0000007100000070000000840005000300000072000000500000001E0000008400050003000000730000005D0000002C0000008000050003000000740000007200000073000000840005000300000075000000480000001E00000080000500030000007600000074000000750000008400050003000000770000003F0000001E0000008000050003000000780000007600000077000000840005000300000079000000360000002C00000080000500030000007A000000780000007900000084000500030000007B000000400000002C00000080000500030000007C0000007A0000007B000000410006007D0000007E0000000A0000002A0000007C0000003D000400090000007F0000007E0000005200060023000000800000007F00000024000000000000008000050003000000810000007C0000002D000000410006007D000000820000000A0000002A000000810000003D0004000900000083000000820000005200060023000000840000008300000080000000010000008000050003000000850000007C00000026000000410006007D000000860000000A0000002A000000850000003D0004000900000087000000860000005200060023000000880000008700000084000000020000005100050009000000890000008800000000000000500007000E0000008A000000890000008900000089000000890000000C0008000E0000008B0000008C000000320000008A0000006B0000005E00000051000500090000008D0000008800000001000000500007000E0000008E0000008D0000008D0000008D0000008D0000000C0008000E0000008F0000008C000000320000008E0000006E0000008B0000005100050009000000900000008800000002000000500007000E00000091000000900000009000000090000000900000000C0008000E000000920000008C0000003200000091000000710000008F000000800005000300000093000000720000007500000080000500030000009400000093000000770000008000050003000000950000009400000079000000800005000300000096000000950000007B00000080000500030000009700000096000000730000008000
05000300000098000000970000001D000000410006007D000000990000000A0000002A000000980000003D000400090000009A0000009900000052000600230000009B0000009A000000240000000000000080000500030000009C000000970000001C000000410006007D0000009D0000000A0000002A0000009C0000003D000400090000009E0000009D00000052000600230000009F0000009E0000009B000000010000008000050003000000A00000009700000025000000410006007D000000A10000000A0000002A000000A00000003D00040009000000A2000000A10000005200060023000000A3000000A20000009F000000020000005100050009000000A4000000A300000000000000500007000E000000A5000000A4000000A4000000A4000000A40000000C0008000E000000A60000008C00000032000000A50000006B0000005F0000005100050009000000A7000000A300000001000000500007000E000000A8000000A7000000A7000000A7000000A70000000C0008000E000000A90000008C00000032000000A80000006E000000A60000005100050009000000AA000000A300000002000000500007000E000000AB000000AA000000AA000000AA000000AA0000000C0008000E000000AC0000008C00000032000000AB00000071000000A90000008000050003000000AD000000970000001B000000410006007D000000AE0000000A0000002A000000AD0000003D00040009000000AF000000AE0000005200060023000000B0000000AF00000024000000000000008000050003000000B1000000970000001A000000410006007D000000B20000000A0000002A000000B10000003D00040009000000B3000000B20000005200060023000000B4000000B3000000B0000000010000008000050003000000B50000009700000019000000410006007D000000B60000000A0000002A000000B50000003D00040009000000B7000000B60000005200060023000000B8000000B7000000B4000000020000005100050009000000B9000000B800000000000000500007000E000000BA000000B9000000B9000000B9000000B90000000C0008000E000000BB0000008C00000032000000BA0000006B000000600000005100050009000000BC000000B800000001000000500007000E000000BD000000BC000000BC000000BC000000BC0000000C0008000E000000BE0000008C00000032000000BD0000006E000000BB0000005100050009000000BF000000B800000002000000500007000E000000C0000000BF000000BF000000BF000000BF0000000C0008000E000000C10000008C00000032000000C000000071000000BE0000008000050003000000C20000009700000018000000410006007D000000C30000000A0000002A000000C20000003D00040009000000C4000000C30000005200060023000000C5000000C400000024000000000000008000050003000000C60000009700000017000000410006007D000000C70000000A0000002A000000C60000003D00040009000000C8000000C70000005200060023000000C9000000C8000000C5000000010000008000050003000000CA0000009700000016000000410006007D000000CB0000000A0000002A000000CA0000003D00040009000000CC000000CB0000005200060023000000CD000000CC000000C9000000020000005100050009000000CE000000CD00000000000000500007000E000000CF000000CE000000CE000000CE000000CE0000000C0008000E000000D00000008C00000032000000CF0000006B000000610000005100050009000000D1000000CD00000001000000500007000E000000D2000000D1000000D1000000D1000000D10000000C0008000E000000D30000008C00000032000000D20000006E000000D00000005100050009000000D4000000CD00000002000000500007000E000000D5000000D4000000D4000000D4000000D40000000C0008000E000000D60000008C00000032000000D500000071000000D30000003E00030057000000920000003E00030058000000AC0000003E00030059000000C10000003E0003005A000000D60000008000050003000000D70000005D0000002D000000F90002005B000000F8000200510000003D0004000E000000D80000005A0000003D0004000E000000D9000000590000003D0004000E000000DA000000580000003D0004000E000000DB000000570000003E0003004A000000DB0000003E0003004B000000DA0000003E0003004C000000D90000003E0003004D000000D80000008000050003000000DC000000500000002D000000F90002004E000000F8000200450000003D0004000E000000DD0000004D0000003D0004000E000000DE0000004C0000003D0004000E000000DF0000004B0000003D0004000E000000E00000004A000000800005000300000
0E1000000350000003E0000008700050003000000E2000000E1000000270000008400050003000000E300000044000000150000008000050003000000E4000000E2000000E30000008400050003000000E50000003C000000150000008000050003000000E6000000E4000000E50000008400050003000000E700000034000000250000008000050003000000E8000000E6000000E70000008400050003000000E90000003D000000250000008000050003000000EA000000E8000000E90000008000050003000000EB000000EA0000001F0000004100060069000000EC000000100000002A000000EB0000003E000300EC000000DD0000008000050003000000ED000000EA000000280000004100060069000000EE000000100000002A000000ED0000003E000300EE000000DE0000008000050003000000EF000000EA000000250000004100060069000000F0000000100000002A000000EF0000003E000300F0000000DF0000004100060069000000F1000000100000002A000000EA0000003E000300F1000000E00000008000050003000000F2000000440000002B000000F900020041000000F800020043000000FD0001003800010008000C0004000800"> : vector<5744xi8>, format = "vulkan-spirv-fb", mime_type = "application/x-flatbuffers"}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass //----- //
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.binary public @vulkan_spirv_fb attributes {data = dense<"0x0800000053505645A0E9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30008D050000030223070000010016000000F30000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000B0006008C000000474C534C2E7374642E343530000000000E00030000000000010000000F0009000500000013000000636F6E765F64697370617463685F3000050000000400000010000600130000001100000008000000020000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050007000A0000005F5F7265736F757263655F7661725F305F305F00050007000F0000005F5F7265736F757263655F7661725F305F315F0005000700100000005F5F7265736F757263655F7661725F305F325F000500060013000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000040000004800050007000000000000002300000000000000470003000700000002000000470004000A0000002100000000000000470004000A0000002200000000000000470004000D0000000600000010000000480005000C000000000000002300000000000000470003000C00000002000000470004000F0000002100000001000000470004000F0000002200000000000000470003000C0000000200000047000400100000002100000002000000470004001000000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B0004000100000005000000010000001600030009000000200000001D00030008000000090000001E000300070000000800000020000400060000000C000000070000003B000400060000000A0000000C000000170004000E00000009000000040000001D0003000D0000000E0000001E0003000C0000000D000000200004000B0000000C0000000C0000003B0004000B0000000F0000000C0000003B0004000B000000100000000C00000013000200120000002100030011000000120000002B0004000300000015000000800300002B0004000300000016000000140000002B0004000300000017000000130000002B0004000300000018000000120000002B00040003000000190000000E0000002B000400030000001A0000000D0000002B000400030000001B0000000C0000002B000400030000001C000000070000002B000400030000001D000000060000002B000400030000001E000000A30200002B000400030000001F000000180000002B0004000300000020000000480000002B0004000900000022000000000000002C0007000E0000002100000022000000220000002200000022000000170004002300000009000000030000002C00060023000000240000002200000022000000220000002B0004000300000025000000080000002B0004000300000026000000020000002B0004000300000027000000040000002B0004000300000028000000100000002B0004000300000029000000200000002B000400030000002A000000000000002B000400030000002B000000700000002B000400030000002C000000030000002B000400030000002D0000000100000014000200460000002000040049000000070000000E00000020000400690000000C0000000E000000200004007D0000000C000000090000003600050012000000130000000000000011000000F8000200140000003B000400490000004A000000070000003B000400490000004B000000070000003B000400490000004C000000070000003B000400490000004D000000070000003B0004004900000057000000070000003B0004004900000058000000070000003B0004004900000059000000070000003B000400490000005A000000070000003D000400020000002E0000000500000051000500030000002F0000002E000000000000003D00040002000000300000000500000051000500030000003100000030000000010000003D000400020000003200000005000000510005000300000033000000320000000200000084000500030000003400000031000000250000008400050003000000350000002F0000002900000084000500030000003600000031000000280000003D00040002000000370000000400000051000500030000003800000037000000000000003D0004000
2000000390000000400000051000500030000003A00000039000000010000003D000400020000003B0000000400000051000500030000003C0000003B0000000200000084000500030000003D0000003A0000002700000084000500030000003E000000380000002700000084000500030000003F0000003C000000260000008400050003000000400000003A00000025000000F900020041000000F800020041000000F50007000300000044000000F2000000450000003300000014000000B10005004600000047000000440000002B000000F6000400430000004200000000000000FA000400470000004200000043000000F8000200420000008400050003000000480000004400000026000000F90002004E000000F80002004E000000F50007000300000050000000DC000000510000002A00000042000000F50007000E00000052000000DB000000510000002100000042000000F50007000E00000053000000DA000000510000002100000042000000F50007000E00000054000000D9000000510000002100000042000000F50007000E00000055000000D8000000510000002100000042000000B10005004600000056000000500000002C000000F6000400450000004F00000000000000FA000400560000004F00000045000000F80002004F000000F90002005B000000F80002005B000000F5000700030000005D000000D70000005C0000002A0000004F000000F50007000E0000005E000000920000005C000000520000004F000000F50007000E0000005F000000AC0000005C000000530000004F000000F50007000E00000060000000C10000005C000000540000004F000000F50007000E00000061000000D60000005C000000550000004F000000B100050046000000620000005D0000002C000000F6000400510000005C00000000000000FA000400620000005C00000051000000F80002005C000000800005000300000063000000350000003E000000870005000300000064000000630000002700000084000500030000006500000050000000200000008400050003000000660000005D0000001F0000008000050003000000670000006500000066000000800005000300000068000000670000006400000041000600690000006A0000000F0000002A000000680000003D0004000E0000006B0000006A00000080000500030000006C000000680000002500000041000600690000006D0000000F0000002A0000006C0000003D0004000E0000006E0000006D00000080000500030000006F00000068000000280000004100060069000000700000000F0000002A0000006F0000003D0004000E0000007100000070000000840005000300000072000000500000001E0000008400050003000000730000005D0000002C0000008000050003000000740000007200000073000000840005000300000075000000480000001E00000080000500030000007600000074000000750000008400050003000000770000003F0000001E0000008000050003000000780000007600000077000000840005000300000079000000360000002C00000080000500030000007A000000780000007900000084000500030000007B000000400000002C00000080000500030000007C0000007A0000007B000000410006007D0000007E0000000A0000002A0000007C0000003D000400090000007F0000007E0000005200060023000000800000007F00000024000000000000008000050003000000810000007C0000002D000000410006007D000000820000000A0000002A000000810000003D0004000900000083000000820000005200060023000000840000008300000080000000010000008000050003000000850000007C00000026000000410006007D000000860000000A0000002A000000850000003D0004000900000087000000860000005200060023000000880000008700000084000000020000005100050009000000890000008800000000000000500007000E0000008A000000890000008900000089000000890000000C0008000E0000008B0000008C000000320000008A0000006B0000005E00000051000500090000008D0000008800000001000000500007000E0000008E0000008D0000008D0000008D0000008D0000000C0008000E0000008F0000008C000000320000008E0000006E0000008B0000005100050009000000900000008800000002000000500007000E00000091000000900000009000000090000000900000000C0008000E000000920000008C0000003200000091000000710000008F000000800005000300000093000000720000007500000080000500030000009400000093000000770000008000050003000000950000009400000079000000800005000300000096000000950000007B00000080000500030000009700000096000000730000008000
05000300000098000000970000001D000000410006007D000000990000000A0000002A000000980000003D000400090000009A0000009900000052000600230000009B0000009A000000240000000000000080000500030000009C000000970000001C000000410006007D0000009D0000000A0000002A0000009C0000003D000400090000009E0000009D00000052000600230000009F0000009E0000009B000000010000008000050003000000A00000009700000025000000410006007D000000A10000000A0000002A000000A00000003D00040009000000A2000000A10000005200060023000000A3000000A20000009F000000020000005100050009000000A4000000A300000000000000500007000E000000A5000000A4000000A4000000A4000000A40000000C0008000E000000A60000008C00000032000000A50000006B0000005F0000005100050009000000A7000000A300000001000000500007000E000000A8000000A7000000A7000000A7000000A70000000C0008000E000000A90000008C00000032000000A80000006E000000A60000005100050009000000AA000000A300000002000000500007000E000000AB000000AA000000AA000000AA000000AA0000000C0008000E000000AC0000008C00000032000000AB00000071000000A90000008000050003000000AD000000970000001B000000410006007D000000AE0000000A0000002A000000AD0000003D00040009000000AF000000AE0000005200060023000000B0000000AF00000024000000000000008000050003000000B1000000970000001A000000410006007D000000B20000000A0000002A000000B10000003D00040009000000B3000000B20000005200060023000000B4000000B3000000B0000000010000008000050003000000B50000009700000019000000410006007D000000B60000000A0000002A000000B50000003D00040009000000B7000000B60000005200060023000000B8000000B7000000B4000000020000005100050009000000B9000000B800000000000000500007000E000000BA000000B9000000B9000000B9000000B90000000C0008000E000000BB0000008C00000032000000BA0000006B000000600000005100050009000000BC000000B800000001000000500007000E000000BD000000BC000000BC000000BC000000BC0000000C0008000E000000BE0000008C00000032000000BD0000006E000000BB0000005100050009000000BF000000B800000002000000500007000E000000C0000000BF000000BF000000BF000000BF0000000C0008000E000000C10000008C00000032000000C000000071000000BE0000008000050003000000C20000009700000018000000410006007D000000C30000000A0000002A000000C20000003D00040009000000C4000000C30000005200060023000000C5000000C400000024000000000000008000050003000000C60000009700000017000000410006007D000000C70000000A0000002A000000C60000003D00040009000000C8000000C70000005200060023000000C9000000C8000000C5000000010000008000050003000000CA0000009700000016000000410006007D000000CB0000000A0000002A000000CA0000003D00040009000000CC000000CB0000005200060023000000CD000000CC000000C9000000020000005100050009000000CE000000CD00000000000000500007000E000000CF000000CE000000CE000000CE000000CE0000000C0008000E000000D00000008C00000032000000CF0000006B000000610000005100050009000000D1000000CD00000001000000500007000E000000D2000000D1000000D1000000D1000000D10000000C0008000E000000D30000008C00000032000000D20000006E000000D00000005100050009000000D4000000CD00000002000000500007000E000000D5000000D4000000D4000000D4000000D40000000C0008000E000000D60000008C00000032000000D500000071000000D30000003E00030057000000920000003E00030058000000AC0000003E00030059000000C10000003E0003005A000000D60000008000050003000000D70000005D0000002D000000F90002005B000000F8000200510000003D0004000E000000D80000005A0000003D0004000E000000D9000000590000003D0004000E000000DA000000580000003D0004000E000000DB000000570000003E0003004A000000DB0000003E0003004B000000DA0000003E0003004C000000D90000003E0003004D000000D80000008000050003000000DC000000500000002D000000F90002004E000000F8000200450000003D0004000E000000DD0000004D0000003D0004000E000000DE0000004C0000003D0004000E000000DF0000004B0000003D0004000E000000E00000004A000000800005000300000
0E1000000350000003E0000008700050003000000E2000000E1000000270000008400050003000000E300000044000000150000008000050003000000E4000000E2000000E30000008400050003000000E50000003C000000150000008000050003000000E6000000E4000000E50000008400050003000000E700000034000000250000008000050003000000E8000000E6000000E70000008400050003000000E90000003D000000250000008000050003000000EA000000E8000000E90000008000050003000000EB000000EA0000001F0000004100060069000000EC000000100000002A000000EB0000003E000300EC000000DD0000008000050003000000ED000000EA000000280000004100060069000000EE000000100000002A000000ED0000003E000300EE000000DE0000008000050003000000EF000000EA000000250000004100060069000000F0000000100000002A000000EF0000003E000300F0000000DF0000004100060069000000F1000000100000002A000000EA0000003E000300F1000000E00000008000050003000000F2000000440000002B000000F900020041000000F800020043000000FD0001003800010008000C0004000800"> : vector<5744xi8>, format = "vulkan-spirv-fb", mime_type = "application/x-flatbuffers"}
}
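// Editorial note (not compiler output): the `data` blob above is the serialized
// executable. The bytes "53505645" near the start spell "SPVE" in ASCII, which
// looks like a FlatBuffers file identifier (consistent with the mime_type
// "application/x-flatbuffers"), and the little-endian SPIR-V magic number
// 0x07230203 appears as "03022307" shortly after, marking the start of the
// embedded SPIR-V module. A reduced sketch of the same op with the 5744 bytes
// elided by hand (the "..." is not valid MLIR, just an abbreviation):
//
//   hal.executable private @conv_dispatch_0 {
//     hal.interface public @io {
//       // three StorageBuffer bindings, as printed above
//     }
//     hal.executable.binary public @vulkan_spirv_fb attributes {
//       data = dense<"0x0800000053505645..."> : vector<5744xi8>,
//       format = "vulkan-spirv-fb",
//       mime_type = "application/x-flatbuffers"
//     }
//   }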
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
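// Editorial sanity check (not compiler output): the binding lengths in the dump
// above match the tensor shapes at 4 bytes per f32 element:
//
//   %c607500  = 1*225*225*3*4  = 607,500   bytes (input,  tensor<1x225x225x3xf32>)
//   %c3456    = 3*3*3*32*4     = 3,456     bytes (filter, tensor<3x3x3x32xf32>)
//   %c1605632 = 1*112*112*32*4 = 1,605,632 bytes (output, tensor<1x112x112x32xf32>)
//
// The buffer view's type operand %c50331680_i32 is 0x03000020, which appears to
// encode the f32 element type (numerical type 0x03, bit width 32). The
// %_device_query_0 guard branches to ^bb1 (record and submit the dispatch over
// workgroups([%c1, %c14, %c112]), then wrap %buffer_1 in a buffer view) when the
// device supports the compiled SPIR-V variant, and to ^bb2 (util.unreachable)
// otherwise.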
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
%_device_query_0 = util.global.load @_device_query_0 : i1
%_executable_conv_dispatch_0 = util.global.load @_executable_conv_dispatch_0 : !hal.executable
%c1_i32 = constant 1 : i32
%c50331680_i32 = constant 50331680 : i32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c14 = constant 14 : index
%c2 = constant 2 : index
%c0 = constant 0 : index
%c1605632 = constant 1605632 : index
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
cond_br %_device_query_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%buffer : !hal.buffer)[%c0, %c607500],
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c1605632]
])
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_conv_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c14, %c112])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c1, %c112, %c112, %c32]) type(%c50331680_i32) encoding(%c1_i32) : !hal.buffer_view
return %view : !hal.buffer_view
^bb2: // pred: ^bb0
util.unreachable "device not supported in the compiled configuration"
}
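// Editorial note: SimplifyGlobalAccesses reorders rather than rewrites here.
// Compared with the CSE dump above, the three util.global.load ops (including
// @_executable_conv_dispatch_0, which previously sat inside ^bb1) have been
// hoisted to the function entry. Loads of immutable globals are side-effect
// free, so they can move above the side-effecting command-buffer ops and be
// grouped for later folding. A minimal before/after sketch with a hypothetical
// global @g (names are illustrative, not from this module):
//
//   // before:
//   hal.command_buffer.begin<%cmd : !hal.command_buffer>
//   %g = util.global.load @g : !hal.executable
//   hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%g : !hal.executable)[0] workgroups([%x, %y, %z])
//
//   // after:
//   %g = util.global.load @g : !hal.executable
//   hal.command_buffer.begin<%cmd : !hal.command_buffer>
//   hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%g : !hal.executable)[0] workgroups([%x, %y, %z])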
// -----// IR Dump After SymbolDCE //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_po