Skip to content

Instantly share code, notes, and snippets.

@antiagainst
Last active September 19, 2021 21:14
Embed
What would you like to do?
This file has been truncated, but you can view the full file.
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
}
// -----// IR Dump After Canonicalizer //----- //
func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After TopLevelSCFToCFG //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After LegalizeInputTypes //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%cst = constant 0.000000e+00 : f32
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After IREEImportPublic //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After ExpandGlobalDynamicDims //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = constant 0.000000e+00 : f32
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %5 : !hal.buffer_view
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112_0 = constant 112 : index
%c32_1 = constant 32 : index
%4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %5 to %c112_0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %7 to %c112_0 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %9 to %c32_1 step %10 {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112_0 = constant 112 : index
%c32_1 = constant 32 : index
%4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %5 to %c112_0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %7 to %c112_0 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %9 to %c32_1 step %10 {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112_0 = constant 112 : index
%c32_1 = constant 32 : index
%4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %5 to %c112_0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %7 to %c112_0 step %8 {
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %9 to %c32_1 step %10 {
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32_0 = constant 32 : index
%c112_1 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %4 to %c112_1 step %5 {
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %6 to %c112_1 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %8 to %c32_0 step %9 {
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%11 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%13 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
%14 = flow.dispatch.tensor.load %arg2, offsets = [0, %10, %12, 0], sizes = [1, %11, %13, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%16 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %15], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%17 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
%18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
%19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
%20 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
%21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
%22 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
%23 = linalg.init_tensor [1, %20, %21, %22] : tensor<1x?x?x?xf32>
%24 = linalg.fill(%cst, %23) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%25 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%14, %16 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%24 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %25, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %17, %18, %19], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After OutlineDispatchRegions //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutables //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After HoistUnstreamableOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%c32 = constant 32 : index
%c112 = constant 112 : index
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After InsertConstantClones //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After FormStreams //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After OutlineLargeConstants //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
%14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
%15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
%16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
%17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
%18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
module {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
flow.executable private @conv_dispatch_0 {
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
builtin.module {
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c32 = constant 32 : index
%c112 = constant 112 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
hal.executable private @conv_dispatch_0 {
hal.interface public @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %4 : tensor<1x112x112x32xf32>
}
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
builtin.module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32>
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After LinalgBufferize //----- //
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = flow.dispatch.tensor.load %1, offsets = [0, %12, %14, 0], sizes = [1, %13, %15, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %18] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%20 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %18], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%22 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%23 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%24 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z]
%25 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%26 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%27 = linalg.init_tensor [1, %24, %25, %26] : tensor<1x?x?x?xf32>
%28 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%29 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%30 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%31 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %28, %29, %30] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %31) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = linalg.fill(%cst, %27) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %19 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%31 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
%33 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%17, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%32 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
}
}
}
return
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After Canonicalizer //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// -----// IR Dump After CSE //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %17] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %21) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%21 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// -----// IR Dump After CleanupBufferAllocView //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y]
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %18) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// -----// IR Dump After SetNumWorkgroups //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c112 = constant 112 : index
%c14 = constant 14 : index
%c1 = constant 1 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%c8 = constant 8 : index
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c1]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %c1]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c8]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c8]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c32]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c32]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (3, d0 * -2 + 227)>(%arg0)[%c1]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (17, d0 * -2 + 227)>(%arg1)[%c8]
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%14 = affine.min affine_map<(d0)[s0] -> (32, -d0 + 32)>(%arg2)[%c32]
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%16 = affine.min affine_map<(d0)[s0] -> (1, -d0 + 112)>(%arg0)[%c1]
%17 = affine.min affine_map<(d0)[s0] -> (8, -d0 + 112)>(%arg1)[%c8]
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %18) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After Canonicalizer //----- //
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> {
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c1 = constant 1 : index
%c14 = constant 14 : index
%c112 = constant 112 : index
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module {
func @conv_dispatch_0() {
%cst = constant 0.000000e+00 : f32
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c32 step %6 {
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// -----// IR Dump After Canonicalizer //----- //
module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c32 step %6 {
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After CSE //----- //
module {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c32 = constant 32 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c32 step %6 {
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1)
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2)
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1)
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVRemoveOneTripTiledLoop //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%6 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0)
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%3)
%8 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%3)
%9 = memref.subview %0[0, %5, %7, 0] [1, %6, %8, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%10 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%4)
%11 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, %10] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%12 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0)
%13 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%3)
%14 = memref.subview %2[0, %arg0, %3, %4] [1, %12, %13, %10] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %14) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%9, %11 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%14 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
return
}
// -----// IR Dump After SPIRVTileAndDistribute //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c112 = constant 112 : index
%cst = constant 0.000000e+00 : f32
%c3 = constant 3 : index
%c1 = constant 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%7 = memref.subview %0[0, %5, %6, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%8 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%9 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%10 = "gpu.thread_id"() {dimension = "x"} : () -> index
%11 = "gpu.thread_id"() {dimension = "y"} : () -> index
%12 = "gpu.thread_id"() {dimension = "z"} : () -> index
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%10]
%15 = memref.subview %9[0, %12, %13, %14] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%cst, %15) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%16 = "gpu.thread_id"() {dimension = "x"} : () -> index
%17 = "gpu.thread_id"() {dimension = "y"} : () -> index
%18 = "gpu.thread_id"() {dimension = "z"} : () -> index
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17]
%23 = memref.subview %7[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%24 = memref.subview %8[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%25 = memref.subview %9[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
scf.for %arg1 = %c0 to %c3 step %c1 {
scf.for %arg2 = %c0 to %c3 step %c1 {
%26 = memref.subview %23[0, %arg1, %arg2, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%27 = memref.subview %24[%arg1, %arg2, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "vectorize", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%26, %27 : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%25 : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// -----// IR Dump After SPIRVVectorize //----- //
func @conv_dispatch_0() {
%c0 = constant 0 : index
%c112 = constant 112 : index
%c3 = constant 3 : index
%c1 = constant 1 : index
%cst = constant dense<0.000000e+00> : vector<1x1x4x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%cst_1 = constant dense<0.000000e+00> : vector<1x4xf32>
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = vector.extract_strided_slice %cst {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%13 = vector.extract_strided_slice %cst {offsets = [0, 0, 1, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%14 = vector.extract_strided_slice %cst {offsets = [0, 0, 2, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%15 = vector.extract_strided_slice %cst {offsets = [0, 0, 3, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32>
%16 = "gpu.thread_id"() {dimension = "x"} : () -> index
%17 = "gpu.thread_id"() {dimension = "y"} : () -> index
%18 = "gpu.thread_id"() {dimension = "z"} : () -> index
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17]
%23 = memref.subview %6[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%24 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%25 = memref.subview %0[0, %24, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%26 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%27 = memref.subview %26[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %12, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %13, %27[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %14, %27[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %15, %27[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%28 = memref.subview %25[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%29 = memref.subview %26[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%30 = vector.transfer_read %29[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %29[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%32 = vector.transfer_read %29[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%33 = vector.transfer_read %29[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%34:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %30, %arg3 = %31, %arg4 = %32, %arg5 = %33) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%35:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%36 = memref.subview %28[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%37 = memref.subview %23[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%38 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%39 = vector.transfer_read %37[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%40 = vector.transfer_read %37[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%41 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%43 = vector.extract %38[0] : vector<1x4xf32>
%44 = vector.extract %42[0, 0] : vector<1x1xf32>
%45 = splat %44 : vector<4xf32>
%46 = vector.extract %arg7[0] : vector<1x4xf32>
%47 = vector.fma %45, %43, %46 : vector<4xf32>
%48 = vector.extract_strided_slice %41 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%49 = vector.extract %39[0] : vector<1x4xf32>
%50 = vector.extract %48[0, 0] : vector<1x1xf32>
%51 = splat %50 : vector<4xf32>
%52 = vector.fma %51, %49, %47 : vector<4xf32>
%53 = vector.extract_strided_slice %41 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%54 = vector.extract %40[0] : vector<1x4xf32>
%55 = vector.extract %53[0, 0] : vector<1x1xf32>
%56 = splat %55 : vector<4xf32>
%57 = vector.fma %56, %54, %52 : vector<4xf32>
%58 = vector.insert %57, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%59 = vector.transfer_read %36[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%60 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.extract %38[0] : vector<1x4xf32>
%62 = vector.extract %60[0, 0] : vector<1x1xf32>
%63 = splat %62 : vector<4xf32>
%64 = vector.extract %arg8[0] : vector<1x4xf32>
%65 = vector.fma %63, %61, %64 : vector<4xf32>
%66 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%67 = vector.extract %39[0] : vector<1x4xf32>
%68 = vector.extract %66[0, 0] : vector<1x1xf32>
%69 = splat %68 : vector<4xf32>
%70 = vector.fma %69, %67, %65 : vector<4xf32>
%71 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%72 = vector.extract %40[0] : vector<1x4xf32>
%73 = vector.extract %71[0, 0] : vector<1x1xf32>
%74 = splat %73 : vector<4xf32>
%75 = vector.fma %74, %72, %70 : vector<4xf32>
%76 = vector.insert %75, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%77 = vector.transfer_read %36[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%78 = vector.extract_strided_slice %77 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%79 = vector.extract %38[0] : vector<1x4xf32>
%80 = vector.extract %78[0, 0] : vector<1x1xf32>
%81 = splat %80 : vector<4xf32>
%82 = vector.extract %arg9[0] : vector<1x4xf32>
%83 = vector.fma %81, %79, %82 : vector<4xf32>
%84 = vector.extract_strided_slice %77 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%85 = vector.extract %39[0] : vector<1x4xf32>
%86 = vector.extract %84[0, 0] : vector<1x1xf32>
%87 = splat %86 : vector<4xf32>
%88 = vector.fma %87, %85, %83 : vector<4xf32>
%89 = vector.extract_strided_slice %77 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%90 = vector.extract %40[0] : vector<1x4xf32>
%91 = vector.extract %89[0, 0] : vector<1x1xf32>
%92 = splat %91 : vector<4xf32>
%93 = vector.fma %92, %90, %88 : vector<4xf32>
%94 = vector.insert %93, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
%95 = vector.transfer_read %36[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%96 = vector.extract_strided_slice %95 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%97 = vector.extract %38[0] : vector<1x4xf32>
%98 = vector.extract %96[0, 0] : vector<1x1xf32>
%99 = splat %98 : vector<4xf32>
%100 = vector.extract %arg10[0] : vector<1x4xf32>
%101 = vector.fma %99, %97, %100 : vector<4xf32>
%102 = vector.extract_strided_slice %95 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%103 = vector.extract %39[0] : vector<1x4xf32>
%104 = vector.extract %102[0, 0] : vector<1x1xf32>
%105 = splat %104 : vector<4xf32>
%106 = vector.fma %105, %103, %101 : vector<4xf32>
%107 = vector.extract_strided_slice %95 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%108 = vector.extract %40[0] : vector<1x4xf32>
%109 = vector.extract %107[0, 0] : vector<1x1xf32>
%110 = splat %109 : vector<4xf32>
%111 = vector.fma %110, %108, %106 : vector<4xf32>
%112 = vector.insert %111, %cst_1 [0] : vector<4xf32> into vector<1x4xf32>
scf.yield %58, %76, %94, %112 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %35#0, %35#1, %35#2, %35#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %34#3, %29[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %34#2, %29[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %34#1, %29[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %34#0, %29[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
// -----// IR Dump After Canonicalizer //----- //
// Dispatch region for a 2-D convolution: 1x225x225x3 input (*) 3x3x3x32 filter
// -> 1x112x112x32 output with stride 2 (the stride shows up below as the
// `d0 * 2` / `s0 * 2` affine maps and the %c0/%c2/%c4/%c6 read offsets).
// NOTE(review): this is an auto-generated IR dump (post-Canonicalizer); the
// sequential %N SSA names are assigned by the MLIR printer — do not hand-edit
// value numbers if this text is meant to be re-parsed.
module {
func @conv_dispatch_0() {
// Zero vector used to initialize the output accumulators in memory, plus
// scalar/index constants for padding values, loop bounds and offsets.
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
// HAL buffer bindings: %0 = input image (read-only), %1 = filter weights
// (read-only), %2 = convolution output (write/discard).
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
// Workgroup tile origin: each workgroup covers an 8-row x 32-channel output
// tile (y*8 output rows, x*32 output channels); y*16 is the corresponding
// stride-2 input-row origin.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
// Filter slice for this workgroup's 32 output channels.
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Per-thread offsets. The thread ids are queried twice (%7-%9 and %12-%14)
// with identical dimensions — presumably two distinct distribution steps
// that later CSE would merge; the two id sets index the same 4x4 tile.
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
// This thread's 4 output channels of the filter (3x3 window, 3 input chans).
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Distribute output rows (112 total) across workgroups in the z dimension.
// With step == trip count (112) each workgroup executes at most one
// iteration; %20 = 2*row is the stride-2 input-row origin.
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-initialize this thread's 1x1x4x4 output tile, one 1x4 row at a time.
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Input patch feeding this thread's 4 output columns (stride 2 => 7 input
// columns + 2 for the 3-wide window, read as 1x3x9x3).
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// Re-load the (just-zeroed) accumulators as four vector<1x4> values; they
// are carried through the reduction loops as iter_args.
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// Reduction over the 3x3 filter window: %arg1 = filter row, %arg6 = filter
// column. The body is the unrolled update of the 4 output-column
// accumulators; each one is 3 FMAs (one per input channel) of an input
// scalar broadcast against a 4-wide filter vector.
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Filter vectors for input channels 0..2 at this (row, col) tap.
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Output column 0: input pixel at width offset 0 (stride 2).
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
// Output column 1: input pixel at width offset 2.
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
// Output column 2: input pixel at width offset 4.
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>
%83 = splat %82 : vector<4xf32>
%84 = vector.fma %83, %81, %79 : vector<4xf32>
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%87 = vector.extract %85[0, 0] : vector<1x1xf32>
%88 = splat %87 : vector<4xf32>
%89 = vector.fma %88, %86, %84 : vector<4xf32>
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32>
// Output column 3: input pixel at width offset 6.
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%94 = vector.extract %92[0, 0] : vector<1x1xf32>
%95 = splat %94 : vector<4xf32>
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32>
%97 = vector.fma %95, %93, %96 : vector<4xf32>
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%100 = vector.extract %98[0, 0] : vector<1x1xf32>
%101 = splat %100 : vector<4xf32>
%102 = vector.fma %101, %99, %97 : vector<4xf32>
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%105 = vector.extract %103[0, 0] : vector<1x1xf32>
%106 = splat %105 : vector<4xf32>
%107 = vector.fma %106, %104, %102 : vector<4xf32>
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the four accumulated 1x4 rows back to the output tile.
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
return
}
// Interface declaration matching the three subspan bindings above.
hal.interface private @io {
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// -----// IR Dump After SPIRVCopyToWorkgroupMemory //----- //
func @conv_dispatch_0() {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%cst_0 = constant 0.000000e+00 : f32
%c6 = constant 6 : index
%c2 = constant 2 : index
%c4 = constant 4 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7]
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12]
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14]
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13]
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 {
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%40 = vector.extract %38[0, 0] : vector<1x1xf32>
%41 = splat %40 : vector<4xf32>
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%43 = vector.fma %41, %39, %42 : vector<4xf32>
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%46 = vector.extract %44[0, 0] : vector<1x1xf32>
%47 = splat %46 : vector<4xf32>
%48 = vector.fma %47, %45, %43 : vector<4xf32>
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%51 = vector.extract %49[0, 0] : vector<1x1xf32>
%52 = splat %51 : vector<4xf32>
%53 = vector.fma %52, %50, %48 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%58 = vector.extract %56[0, 0] : vector<1x1xf32>
%59 = splat %58 : vector<4xf32>
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%61 = vector.fma %59, %57, %60 : vector<4xf32>
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%64 = vector.extract %62[0, 0] : vector<1x1xf32>
%65 = splat %64 : vector<4xf32>
%66 = vector.fma %65, %63, %61 : vector<4xf32>
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%69 = vector.extract %67[0, 0] : vector<1x1xf32>
%70 = splat %69 : vector<4xf32>
%71 = vector.fma %70, %68, %66 : vector<4xf32>
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32>
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32>
%76 = vector.extract %74[0, 0] : vector<1x1xf32>
%77 = splat %76 : vector<4xf32>
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%79 = vector.fma %77, %75, %78 : vector<4xf32>
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32>
%82 = vector.extract %80[0, 0] : vector<1x1xf32>