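// Pass-by-pass IR dump from the IREE compiler (the kind of trace produced by
// MLIR's IR-printing options, e.g. --mlir-print-ir-after-all) for a 3x3,
// stride-2 NHWC convolution: tensor<1x225x225x3xf32> convolved with
// tensor<3x3x3x32xf32>, yielding tensor<1x112x112x32xf32>. The trace is
// truncated and ends mid-pass.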
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
  func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    return %0 : tensor<1x112x112x32xf32>
  }
}
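// NOTE: WrapEntryPointsPass splits the entry point: the exported @conv
// marshals the opaque !hal.buffer_view arguments to typed tensors with
// hal.tensor.cast and calls the private @_conv holding the actual computation.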
// -----// IR Dump After Canonicalizer //----- //
func private @_conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
  %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  return %0 : tensor<1x112x112x32xf32>
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = call @_conv(%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
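// NOTE: The inliner has folded the body of @_conv into @conv, replacing the
// call with the mhlo.convolution itself; @_conv is now an unused symbol.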
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
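// NOTE: SymbolDCE deletes the now-unreferenced private @_conv, leaving only
// the exported @conv in the module.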
// -----// IR Dump After TopLevelSCFToCFG //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After ShapeToShapeLowering //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertShapeToStandard //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After Inliner //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After LegalizeInputTypes //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertMHLOToLinalgExt //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = mhlo.convolution(%0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %cst = constant 0.000000e+00 : f32
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
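// NOTE: ConvertMHLOToLinalgOnTensors is the first pass above that changes the
// computation: mhlo.convolution is rewritten as linalg.conv_2d_nhwc_hwcf, with
// linalg.init_tensor allocating the 1x112x112x32 result and linalg.fill
// zero-initializing it before the convolution accumulates into it.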
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}
// -----// IR Dump After IREEImportPublic //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
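// NOTE: ConvertConv2D1x1ConvToMatmul leaves this IR unchanged: the filter is
// 3x3 (tensor<3x3x3x32xf32>), not 1x1, so the convolution cannot be rewritten
// as a matmul.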
// -----// IR Dump After VerifyInputLegality //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After ExpandGlobalDynamicDims //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}
// -----// IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}
// -----// IR Dump After PadTensorToSubTensorInsert //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}
// -----// IR Dump After ConvertElementwiseToLinalg //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After LinalgFoldUnitExtentDims //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = constant 0.000000e+00 : f32
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
    %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
    %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %5 : !hal.buffer_view
  }
}
// -----// IR Dump After ConvertToFlowBeforeDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = constant 0.000000e+00 : f32
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
  %4 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%0, %1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%3 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  %5 = hal.tensor.cast %4 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %5 : !hal.buffer_view
}
// -----// IR Dump After DispatchLinalgOnTensors //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %cst = constant 0.000000e+00 : f32
    %c112_0 = constant 112 : index
    %c32_1 = constant 32 : index
    %4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %5 to %c112_0 step %6 {
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %7 to %c112_0 step %8 {
        %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %9 to %c32_1 step %10 {
          %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
          %13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
          %15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
          %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
          %20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
          %22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
          %23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
          %24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
          %25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
          %26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
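// NOTE: DispatchLinalgOnTensors tiles the convolution and wraps it in a
// flow.dispatch.workgroups region with workgroup counts [%c32, %c112, %c112].
// The nested scf.for loops distribute tiles over the 3-D workgroup grid via
// flow.dispatch.workgroup.id/count/size; input tiles are read with
// flow.dispatch.tensor.load and result tiles written with
// flow.dispatch.tensor.store.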
// -----// IR Dump After ResolveShapedTypeResultDims //----- //
module {
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
      %cst = constant 0.000000e+00 : f32
      %c112_0 = constant 112 : index
      %c32_1 = constant 32 : index
      %4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
      %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
      %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
      %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
      %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
      %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
      %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
      %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
      %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
      %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
      %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
      %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
      scf.for %arg5 = %5 to %c112_0 step %6 {
        %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
        %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
        scf.for %arg6 = %7 to %c112_0 step %8 {
          %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
          %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
          scf.for %arg7 = %9 to %c32_1 step %10 {
            %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
            %12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
            %13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
            %14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
            %15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
            %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
            %17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
            %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
            %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
            %20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
            %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
            %22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
            %23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
            %24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
            %25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
            %26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
            flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
          }
        }
      }
      flow.return
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After ConvertToFlowAfterDispatchFormation //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %cst = constant 0.000000e+00 : f32
    %c112_0 = constant 112 : index
    %c32_1 = constant 32 : index
    %4 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %5 to %c112_0 step %6 {
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %7 to %c112_0 step %8 {
        %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %9 to %c32_1 step %10 {
          %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %12 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
          %13 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %14 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
          %15 = flow.dispatch.tensor.load %arg2, offsets = [0, %11, %13, 0], sizes = [1, %12, %14, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          %16 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %17 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
          %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
          %20 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
          %22 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
          %23 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
          %24 = tensor.extract_slice %4[0, %arg5, %arg6, %arg7] [1, %21, %22, %23] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
          %25 = linalg.fill(%cst, %24) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
          %26 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%15, %17 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%25 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %26, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %18, %19, %20], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch.workgroups[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %c32_0 = constant 32 : index
    %c112_1 = constant 112 : index
    %cst = constant 0.000000e+00 : f32
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %4 to %c112_1 step %5 {
      %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %6 to %c112_1 step %7 {
        %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %8 to %c32_0 step %9 {
          %10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %11 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg5)
          %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %13 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg6)
          %14 = flow.dispatch.tensor.load %arg2, offsets = [0, %10, %12, 0], sizes = [1, %11, %13, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %16 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, %arg7], sizes = [3, 3, 3, %15], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          %17 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg5)
          %18 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg6)
          %19 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg7)
          %20 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg5, %workgroup_size_2)
          %21 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg6, %workgroup_size_1)
          %22 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg7, %workgroup_size_0)
          %23 = linalg.init_tensor [1, %20, %21, %22] : tensor<1x?x?x?xf32>
          %24 = linalg.fill(%cst, %23) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
          %25 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%14, %16 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%24 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %25, %arg4, offsets = [0, %arg5, %arg6, %arg7], sizes = [1, %17, %18, %19], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After OutlineDispatchRegions //----- //
module {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
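// NOTE: OutlineDispatchRegions moves the dispatch region body into the
// standalone flow.executable @conv_dispatch_0; @conv now invokes it through
// flow.dispatch @conv_dispatch_0::@conv_dispatch_0 instead of carrying the
// region inline.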
// -----// IR Dump After Canonicalizer //----- // | |
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32> | |
%1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32> | |
%2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> | |
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After DeduplicateExecutables //----- // | |
module {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %1 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%0, %1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After HoistUnstreamableOps //----- //
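// Note: this pass appears to hoist ops that cannot participate in stream
// formation (here the hal.tensor.cast imports) to the top of the function,
// ahead of the index constants, so the streamable work can be grouped.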
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After InsertConstantClones //----- //
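// Note: InsertConstantClones would clone constant-backed values whose storage
// might be written or escape; nothing here seems to qualify, so the function
// below is identical to the previous dump.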
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After FormStreams //----- //
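// Note: FormStreams groups streamable ops into flow.ex.stream.fragment
// regions; the single dispatch is wrapped in a fragment that explicitly
// captures the workload indices (%c32, %c112) and both input tensors.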
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %4 : tensor<1x112x112x32xf32>
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After OutlineLargeConstants //----- //
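// Note: OutlineLargeConstants moves large constant tensors into module-level
// storage; this module has no such constants, so the pass appears to be a
// no-op here.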
module {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%c32, %c112, %1, %0) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer //----- //
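// Note: canonicalization sinks the index constants into the stream fragment,
// so the fragment now captures only the two tensor operands and
// rematerializes %c32 and %c112 inside its region.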
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %4 : tensor<1x112x112x32xf32>
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After CSE //----- //
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
  %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
  %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %4 : tensor<1x112x112x32xf32>
  }
  %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
  return %3 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE //----- //
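// Note: SymbolDCE deletes symbols with no remaining references; both
// @conv_dispatch_0 and @conv are live, so nothing is removed here.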
module {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_2, %arg3)
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 227)>(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_2, %arg3)
              %14 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 112)>(%workgroup_size_1, %arg4)
              %15 = affine.min affine_map<(d0, d1) -> (d0, -d1 + 32)>(%workgroup_size_0, %arg5)
              %16 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg3, %workgroup_size_2)
              %17 = affine.min affine_map<(d0, d1) -> (-d0 + 112, d1)>(%arg4, %workgroup_size_1)
              %18 = affine.min affine_map<(d0, d1) -> (-d0 + 32, d1)>(%arg5, %workgroup_size_0)
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer //----- //
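// Note: in this round the affine.min maps are rewritten from the two-dimension
// form (d0, d1) into the (d0)[s0] form, turning the workgroup size into a
// symbol operand; the computed bounds are unchanged.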
module {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- //
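// Note: this pass attaches the hal.device.targets attribute to the module;
// here a single Vulkan/SPIR-V target is assigned, with an ARM:IntegratedGPU
// environment (subgroup size 16, 32 KiB shared memory, at most 512
// invocations per workgroup).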
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- //
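// Note: a verification-only pass; it checks that the assigned
// hal.device.targets can be serviced by registered target backends and leaves
// the IR untouched.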
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::IdentifyConstantPoolsPass //----- //
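// Note: constant pooling would gather large immutable constants into shared
// staging storage; with no pooled constants in this module the pass appears
// to be a no-op.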
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeConstantPoolBuffersPass //----- //
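// Note: likewise a no-op here, since there are no constant pools for which to
// materialize hal buffer resources.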
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c112 = constant 112 : index
        %c32 = constant 32 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} {
  flow.executable private @conv_dispatch_0 {
    flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index}
    builtin.module {
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %cst = constant 0.000000e+00 : f32
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        %0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2]
              %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4)
              %9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2]
              %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1]
              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0]
              %16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2]
              %17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1]
              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0]
              %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
              %20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
              %21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32>
    %1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32>
    %2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c32 = constant 32 : index
      %c112 = constant 112 : index
      %4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %4 : tensor<1x112x112x32xf32>
    }
    %3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view
    return %3 : !hal.buffer_view
  }
}
// -----// IR Dump After SymbolDCE //----- //
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} { | |
flow.executable private @conv_dispatch_0 { | |
flow.dispatch.entry public @conv_dispatch_0 attributes {workgroup_rank = 3 : index} | |
builtin.module { | |
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) { | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index | |
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index | |
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index | |
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index | |
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index | |
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index | |
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index | |
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index | |
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index | |
%0 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2] | |
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2] | |
scf.for %arg3 = %0 to %c112 step %1 { | |
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1] | |
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1] | |
scf.for %arg4 = %2 to %c112 step %3 { | |
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0] | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0] | |
scf.for %arg5 = %4 to %c32 step %5 { | |
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3) | |
%7 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg3)[%workgroup_size_2] | |
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg4) | |
%9 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg4)[%workgroup_size_1] | |
%10 = flow.dispatch.tensor.load %arg0, offsets = [0, %6, %8, 0], sizes = [1, %7, %9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32> | |
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0] | |
%12 = flow.dispatch.tensor.load %arg1, offsets = [0, 0, 0, %arg5], sizes = [3, 3, 3, %11], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32> | |
%13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg3)[%workgroup_size_2] | |
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg4)[%workgroup_size_1] | |
%15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg5)[%workgroup_size_0] | |
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg3)[%workgroup_size_2] | |
%17 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg4)[%workgroup_size_1] | |
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg5)[%workgroup_size_0] | |
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32> | |
%20 = linalg.fill(%cst, %19) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32> | |
%21 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32> | |
flow.dispatch.tensor.store %21, %arg2, offsets = [0, %arg3, %arg4, %arg5], sizes = [1, %13, %14, %15], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
} | |
} | |
} | |
return | |
} | |
} | |
} | |
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32> | |
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32> | |
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> = | |
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> { | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> | |
flow.return %4 : tensor<1x112x112x32xf32> | |
} | |
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass //----- // | |
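// MaterializeInterfacesPass rewrites the flow.executable into a hal.executable: it
// introduces the @io interface (set 0: bindings 0 and 1 are read-only inputs, binding 2 is
// the write/discard output), clones the body into a target-specific hal.executable.variant
// for vulkan-spirv-fb, and replaces the dispatch tensor arguments with
// hal.interface.binding.subspan ops. The flow.dispatch site correspondingly gains a
// hal.bindings attribute mapping operands and results to those bindings.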
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} { | |
hal.executable private @conv_dispatch_0 { | |
hal.interface public @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> { | |
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index} | |
builtin.module { | |
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %5 to %c112 step %6 { | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %7 to %c32 step %8 { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32> | |
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32> | |
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z] | |
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y] | |
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x] | |
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32> | |
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32> | |
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32> | |
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
} | |
} | |
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32> | |
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32> | |
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> = | |
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> { | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> | |
flow.return %4 : tensor<1x112x112x32xf32> | |
} | |
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
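// This canonicalization round only re-orders the constants inside the dispatch function
// and the stream fragment; the structure is otherwise unchanged.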
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {buffer_constraints = #hal.buffer_constraints<max_allocation_size = 1073741824, min_buffer_offset_alignment = 256, max_buffer_range = 134217728, min_buffer_range_alignment = 16>, executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}>]}>]} { | |
hal.executable private @conv_dispatch_0 { | |
hal.interface public @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> { | |
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index} | |
builtin.module { | |
func @conv_dispatch_0() { | |
%cst = constant 0.000000e+00 : f32 | |
%c112 = constant 112 : index | |
%c32 = constant 32 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %5 to %c112 step %6 { | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %7 to %c32 step %8 { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32> | |
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32> | |
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z] | |
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y] | |
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x] | |
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32> | |
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32> | |
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32> | |
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
} | |
} | |
func @conv(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.cast %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32> | |
%1 = hal.tensor.cast %arg0 : !hal.buffer_view -> tensor<1x225x225x3xf32> | |
%2 = flow.ex.stream.fragment(%1, %0) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> = | |
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> { | |
%c112 = constant 112 : index | |
%c32 = constant 32 : index | |
%4 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) {hal.bindings = [#hal.ex.operand_buffer<"s0b0_ro_external", 0 : index>, #hal.ex.operand_buffer<"s0b1_ro_external", 1 : index>, #hal.ex.result_buffer<"s0b2_xw_external", 0 : index>]} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> | |
flow.return %4 : tensor<1x112x112x32xf32> | |
} | |
%3 = hal.tensor.cast %2 : tensor<1x112x112x32xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::PropagateConstantWorkgroupInfoPass //----- // | |
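// From here on the dumps are scoped to the hal.executable.variant rather than the whole
// module. The entry point does not yet carry a static workgroup size, so the
// hal.interface.workgroup.size queries below remain symbolic and nothing folds yet.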
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> { | |
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index} | |
builtin.module { | |
func @conv_dispatch_0() { | |
%cst = constant 0.000000e+00 : f32 | |
%c112 = constant 112 : index | |
%c32 = constant 32 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %5 to %c112 step %6 { | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %7 to %c32 step %8 { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32> | |
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %14], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32> | |
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z] | |
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y] | |
%21 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x] | |
%22 = linalg.init_tensor [1, %19, %20, %21] : tensor<1x?x?x?xf32> | |
%23 = linalg.fill(%cst, %22) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32> | |
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%23 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32> | |
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %16, %17, %18], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
} | |
// -----// IR Dump After LinalgBufferize //----- // | |
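// LinalgBufferize allocates a memref subspan next to each !flow.dispatch.tensor subspan
// and mirrors the tensor computation onto buffers: memref.subview ops replace the tensor
// loads, and linalg.fill / linalg.conv_2d_nhwc_hwcf now run on memrefs in place. The
// original tensor ops are left behind temporarily and are removed by the following passes.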
func @conv_dispatch_0() { | |
%cst = constant 0.000000e+00 : f32 | |
%c112 = constant 112 : index | |
%c32 = constant 32 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %6 to %c112 step %7 { | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %8 to %c112 step %9 { | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %10 to %c32 step %11 { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%17 = flow.dispatch.tensor.load %1, offsets = [0, %12, %14, 0], sizes = [1, %13, %15, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32> | |
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%19 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %18] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%20 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %18], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32> | |
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%22 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%23 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%24 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg0)[%workgroup_size_z] | |
%25 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y] | |
%26 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x] | |
%27 = linalg.init_tensor [1, %24, %25, %26] : tensor<1x?x?x?xf32> | |
%28 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%29 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%30 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%31 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %28, %29, %30] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %31) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%32 = linalg.fill(%cst, %27) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %19 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%31 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
%33 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%17, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%32 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32> | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
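// The tensor-side loads, fill, and convolution are gone; only the memref pipeline remains.
// The now-unused !flow.dispatch.tensor subspans (%1, %3, %5) linger until
// CleanupBufferAllocView drops them.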
module { | |
func @conv_dispatch_0() { | |
%cst = constant 0.000000e+00 : f32 | |
%c112 = constant 112 : index | |
%c32 = constant 32 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %6 to %c112 step %7 { | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %8 to %c112 step %9 { | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %10 to %c32 step %11 { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
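// Canonicalization here is again mostly cosmetic: constants are hoisted into a canonical
// order and the function is printed without its enclosing module.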
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %6 to %c112 step %7 { | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %8 to %c112 step %9 { | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %10 to %c32 step %11 { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %22) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
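// CSE merges the duplicated affine.min computations: the output-channel tile size (%17) is
// now reused for both the filter subview and the output subview instead of being
// recomputed, shrinking the loop body slightly.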
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%3 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> | |
%4 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%5 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %6 to %c112 step %7 { | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %8 to %c112 step %9 { | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %10 to %c32 step %11 { | |
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%21 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %17] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %21) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%21 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocView //----- // | |
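// The dead !flow.dispatch.tensor subspans are dropped; the kernel is now purely
// memref-based, reading from bindings 0 and 1 and writing the tiled result to binding 2.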
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_size_x = hal.interface.workgroup.size[0] : index | |
%workgroup_size_y = hal.interface.workgroup.size[1] : index | |
%workgroup_size_z = hal.interface.workgroup.size[2] : index | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z] | |
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z] | |
scf.for %arg0 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] | |
scf.for %arg1 = %5 to %c112 step %6 { | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] | |
scf.for %arg2 = %7 to %c32 step %8 { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg0)[%workgroup_size_z] | |
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 227)>(%arg1)[%workgroup_size_y] | |
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x] | |
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z] | |
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y] | |
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %18) : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SetNumWorkgroups //----- // | |
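// SetNumWorkgroups picks the SPIR-V lowering strategy for the entry point: the
// "SPIRVVectorize" pipeline with workloadPerWorkgroup = [32, 8, 1] (x, y, z) and a
// workgroup size of [8, 2, 1] threads. It also attaches a region that answers the
// workgroup-count query with constants, and stamps lowering.config tile sizes onto the
// fill and convolution ops. Inside the body, the symbolic workgroup sizes have been
// replaced by the constants 32, 8, and 1.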
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> { | |
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} { | |
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
%c112 = constant 112 : index | |
%c14 = constant 14 : index | |
%c1 = constant 1 : index | |
hal.return %c1, %c14, %c112 : index, index, index | |
} | |
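  // The constant workgroup count follows from dividing the workload [x=32, y=112, z=112]
  // by workloadPerWorkgroup [32, 8, 1], rounding up:
  //   x: ceil(32 / 32)  = 1
  //   y: ceil(112 / 8)  = 14
  //   z: ceil(112 / 1)  = 112
  // matching `hal.return %c1, %c14, %c112` above.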
builtin.module { | |
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%c8 = constant 8 : index | |
%c1 = constant 1 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c1] | |
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %c1] | |
scf.for %arg0 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c8] | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c8] | |
scf.for %arg1 = %5 to %c112 step %6 { | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c32] | |
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c32] | |
scf.for %arg2 = %7 to %c32 step %8 { | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%10 = affine.min affine_map<(d0)[s0] -> (3, d0 * -2 + 227)>(%arg0)[%c1] | |
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%12 = affine.min affine_map<(d0)[s0] -> (17, d0 * -2 + 227)>(%arg1)[%c8] | |
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%14 = affine.min affine_map<(d0)[s0] -> (32, -d0 + 32)>(%arg2)[%c32] | |
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%16 = affine.min affine_map<(d0)[s0] -> (1, -d0 + 112)>(%arg0)[%c1] | |
%17 = affine.min affine_map<(d0)[s0] -> (8, -d0 + 112)>(%arg1)[%c8] | |
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %18) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
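// With the workgroup sizes now constant, the affine maps fold: the z loop steps directly
// by the workgroup id/count, the y and x steps become s0 * 8 and s0 * 32, and the tile
// bounds in the affine.min maps collapse to the literals 3, 17, 32, 1, and 8.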
hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}> { | |
hal.executable.entry_point public @conv_dispatch_0 attributes {interface = @io, ordinal = 0 : index, translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8, 1]}, workgroup_size = [8 : index, 2 : index, 1 : index]} { | |
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors | |
%c1 = constant 1 : index | |
%c14 = constant 14 : index | |
%c112 = constant 112 : index | |
hal.return %c1, %c14, %c112 : index, index, index | |
} | |
builtin.module { | |
func @conv_dispatch_0() { | |
%cst = constant 0.000000e+00 : f32 | |
%c112 = constant 112 : index | |
%c32 = constant 32 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
scf.for %arg1 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
scf.for %arg2 = %5 to %c32 step %6 { | |
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0) | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1) | |
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2) | |
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0) | |
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1) | |
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
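// The same canonicalized kernel, re-printed at module scope with constants re-ordered.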
module { | |
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
scf.for %arg1 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
scf.for %arg2 = %5 to %c32 step %6 { | |
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0) | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1) | |
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2) | |
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0) | |
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1) | |
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
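// Annotation: CSE leaves this dispatch unchanged; the body below is identical to the
// preceding Canonicalizer dump, as there are no redundant subexpressions to merge.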
module { | |
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c32 = constant 32 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_count_y = hal.interface.workgroup.count[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%workgroup_count_z = hal.interface.workgroup.count[2] : index | |
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y] | |
scf.for %arg1 = %3 to %c112 step %4 { | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] | |
scf.for %arg2 = %5 to %c32 step %6 { | |
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%8 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0) | |
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) | |
%10 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%arg1) | |
%11 = memref.subview %0[0, %7, %9, 0] [1, %8, %10, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%12 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%arg2) | |
%13 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %12] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%14 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0) | |
%15 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%arg1) | |
%16 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %14, %15, %12] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%16 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
// -----// IR Dump After SPIRVRemoveOneTripTiledLoop //----- // | |
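// Annotation: the workgroup count now exactly covers the tiled iteration space, so
// the distribution loops over the x and y dimensions are proven single-trip and
// removed. The hal.interface.workgroup.count ops disappear, and only the z loop
// remains, with its step folded to the constant 112.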
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%6 = affine.min affine_map<(d0) -> (3, d0 * -2 + 227)>(%arg0) | |
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%3) | |
%8 = affine.min affine_map<(d0) -> (17, d0 * -2 + 227)>(%3) | |
%9 = memref.subview %0[0, %5, %7, 0] [1, %6, %8, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%10 = affine.min affine_map<(d0) -> (32, -d0 + 32)>(%4) | |
%11 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, %10] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%12 = affine.min affine_map<(d0) -> (1, -d0 + 112)>(%arg0) | |
%13 = affine.min affine_map<(d0) -> (8, -d0 + 112)>(%3) | |
%14 = memref.subview %2[0, %arg0, %3, %4] [1, %12, %13, %10] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %14) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%9, %11 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%14 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVTileAndDistribute //----- // | |
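// Annotation: the 1x8x32 workgroup tile is further distributed over gpu.thread_id
// x/y/z into 1x1x4x4 per-invocation output tiles (the [0, 1, 4, 4] level of
// lowering.config), and the 3x3 filter window becomes an explicit scf.for nest.
// All subview shapes are now static, which enables vectorization in the next pass.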
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c112 = constant 112 : index | |
%cst = constant 0.000000e+00 : f32 | |
%c3 = constant 3 : index | |
%c1 = constant 1 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 { | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y] | |
%7 = memref.subview %0[0, %5, %6, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%8 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%9 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%10 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%11 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%12 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%13 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11] | |
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%10] | |
%15 = memref.subview %9[0, %12, %13, %14] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
linalg.fill(%cst, %15) {__internal_linalg_transform__ = "vectorize", lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}} : f32, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%16 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%17 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%18 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17] | |
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16] | |
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18] | |
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17] | |
%23 = memref.subview %7[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%24 = memref.subview %8[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%25 = memref.subview %9[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
scf.for %arg1 = %c0 to %c3 step %c1 { | |
scf.for %arg2 = %c0 to %c3 step %c1 { | |
%26 = memref.subview %23[0, %arg1, %arg2, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%27 = memref.subview %24[%arg1, %arg2, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "vectorize", dilations = dense<1> : tensor<2xi64>, lowering.config = {tileSizes = [[0, 1, 8, 32], [], [0, 1, 4, 4]]}, strides = dense<2> : tensor<2xi64>} ins(%26, %27 : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%25 : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>) | |
} | |
} | |
} | |
return | |
} | |
// -----// IR Dump After SPIRVVectorize //----- // | |
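// Annotation: the tiled linalg ops are rewritten on vectors. linalg.fill becomes
// vector.transfer_writes of a zero constant, and the convolution body becomes
// vector.transfer_reads of input/filter slices followed by splat + vector.fma chains
// on vector<4xf32>. The four per-thread accumulators are carried through the
// filter-window loops as scf.for iter_args and written back after the loop nest.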
func @conv_dispatch_0() { | |
%c0 = constant 0 : index | |
%c112 = constant 112 : index | |
%c3 = constant 3 : index | |
%c1 = constant 1 : index | |
%cst = constant dense<0.000000e+00> : vector<1x1x4x4xf32> | |
%c4 = constant 4 : index | |
%c2 = constant 2 : index | |
%c6 = constant 6 : index | |
%cst_0 = constant 0.000000e+00 : f32 | |
%cst_1 = constant dense<0.000000e+00> : vector<1x4xf32> | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y] | |
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8] | |
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7] | |
%12 = vector.extract_strided_slice %cst {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32> | |
%13 = vector.extract_strided_slice %cst {offsets = [0, 0, 1, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32> | |
%14 = vector.extract_strided_slice %cst {offsets = [0, 0, 2, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32> | |
%15 = vector.extract_strided_slice %cst {offsets = [0, 0, 3, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x4x4xf32> to vector<1x1x1x4xf32> | |
%16 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%17 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%18 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17] | |
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%16] | |
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18] | |
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%17] | |
%23 = memref.subview %6[0, 0, 0, %20] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 { | |
%24 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%25 = memref.subview %0[0, %24, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%26 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%27 = memref.subview %26[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %12, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %13, %27[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %14, %27[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %15, %27[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%28 = memref.subview %25[0, %21, %22, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%29 = memref.subview %26[0, %18, %19, %20] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%30 = vector.transfer_read %29[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%31 = vector.transfer_read %29[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%32 = vector.transfer_read %29[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%33 = vector.transfer_read %29[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
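// Annotation: each (kh, kw) iteration of the nest below reads one vector<1x3xf32>
// input slice per output position (at the stride-2 offsets 0, 2, 4, 6) and issues
// three vector.fma ops per accumulator, one per input channel: 12 fmas per
// iteration across the four vector<1x4xf32> accumulators.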
%34:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %30, %arg3 = %31, %arg4 = %32, %arg5 = %33) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) { | |
%35:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) { | |
%36 = memref.subview %28[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%37 = memref.subview %23[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%38 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%39 = vector.transfer_read %37[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%40 = vector.transfer_read %37[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%41 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%43 = vector.extract %38[0] : vector<1x4xf32> | |
%44 = vector.extract %42[0, 0] : vector<1x1xf32> | |
%45 = splat %44 : vector<4xf32> | |
%46 = vector.extract %arg7[0] : vector<1x4xf32> | |
%47 = vector.fma %45, %43, %46 : vector<4xf32> | |
%48 = vector.extract_strided_slice %41 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%49 = vector.extract %39[0] : vector<1x4xf32> | |
%50 = vector.extract %48[0, 0] : vector<1x1xf32> | |
%51 = splat %50 : vector<4xf32> | |
%52 = vector.fma %51, %49, %47 : vector<4xf32> | |
%53 = vector.extract_strided_slice %41 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%54 = vector.extract %40[0] : vector<1x4xf32> | |
%55 = vector.extract %53[0, 0] : vector<1x1xf32> | |
%56 = splat %55 : vector<4xf32> | |
%57 = vector.fma %56, %54, %52 : vector<4xf32> | |
%58 = vector.insert %57, %cst_1 [0] : vector<4xf32> into vector<1x4xf32> | |
%59 = vector.transfer_read %36[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%60 = vector.extract_strided_slice %59 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%61 = vector.extract %38[0] : vector<1x4xf32> | |
%62 = vector.extract %60[0, 0] : vector<1x1xf32> | |
%63 = splat %62 : vector<4xf32> | |
%64 = vector.extract %arg8[0] : vector<1x4xf32> | |
%65 = vector.fma %63, %61, %64 : vector<4xf32> | |
%66 = vector.extract_strided_slice %59 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%67 = vector.extract %39[0] : vector<1x4xf32> | |
%68 = vector.extract %66[0, 0] : vector<1x1xf32> | |
%69 = splat %68 : vector<4xf32> | |
%70 = vector.fma %69, %67, %65 : vector<4xf32> | |
%71 = vector.extract_strided_slice %59 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%72 = vector.extract %40[0] : vector<1x4xf32> | |
%73 = vector.extract %71[0, 0] : vector<1x1xf32> | |
%74 = splat %73 : vector<4xf32> | |
%75 = vector.fma %74, %72, %70 : vector<4xf32> | |
%76 = vector.insert %75, %cst_1 [0] : vector<4xf32> into vector<1x4xf32> | |
%77 = vector.transfer_read %36[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%78 = vector.extract_strided_slice %77 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%79 = vector.extract %38[0] : vector<1x4xf32> | |
%80 = vector.extract %78[0, 0] : vector<1x1xf32> | |
%81 = splat %80 : vector<4xf32> | |
%82 = vector.extract %arg9[0] : vector<1x4xf32> | |
%83 = vector.fma %81, %79, %82 : vector<4xf32> | |
%84 = vector.extract_strided_slice %77 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%85 = vector.extract %39[0] : vector<1x4xf32> | |
%86 = vector.extract %84[0, 0] : vector<1x1xf32> | |
%87 = splat %86 : vector<4xf32> | |
%88 = vector.fma %87, %85, %83 : vector<4xf32> | |
%89 = vector.extract_strided_slice %77 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%90 = vector.extract %40[0] : vector<1x4xf32> | |
%91 = vector.extract %89[0, 0] : vector<1x1xf32> | |
%92 = splat %91 : vector<4xf32> | |
%93 = vector.fma %92, %90, %88 : vector<4xf32> | |
%94 = vector.insert %93, %cst_1 [0] : vector<4xf32> into vector<1x4xf32> | |
%95 = vector.transfer_read %36[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%96 = vector.extract_strided_slice %95 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%97 = vector.extract %38[0] : vector<1x4xf32> | |
%98 = vector.extract %96[0, 0] : vector<1x1xf32> | |
%99 = splat %98 : vector<4xf32> | |
%100 = vector.extract %arg10[0] : vector<1x4xf32> | |
%101 = vector.fma %99, %97, %100 : vector<4xf32> | |
%102 = vector.extract_strided_slice %95 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%103 = vector.extract %39[0] : vector<1x4xf32> | |
%104 = vector.extract %102[0, 0] : vector<1x1xf32> | |
%105 = splat %104 : vector<4xf32> | |
%106 = vector.fma %105, %103, %101 : vector<4xf32> | |
%107 = vector.extract_strided_slice %95 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%108 = vector.extract %40[0] : vector<1x4xf32> | |
%109 = vector.extract %107[0, 0] : vector<1x1xf32> | |
%110 = splat %109 : vector<4xf32> | |
%111 = vector.fma %110, %108, %106 : vector<4xf32> | |
%112 = vector.insert %111, %cst_1 [0] : vector<4xf32> into vector<1x4xf32> | |
scf.yield %58, %76, %94, %112 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32> | |
} | |
scf.yield %35#0, %35#1, %35#2, %35#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32> | |
} | |
vector.transfer_write %34#3, %29[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %34#2, %29[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %34#1, %29[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %34#0, %29[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
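// Annotation: canonicalization folds the four vector.extract_strided_slice ops on
// the zero vector into a single vector<1x1x1x4xf32> zero constant reused by all four
// initializing transfer_writes, and replaces the vector.extract/vector.insert pairs
// around each fma chain with flat vector.shape_cast ops.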
module { | |
func @conv_dispatch_0() { | |
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32> | |
%cst_0 = constant 0.000000e+00 : f32 | |
%c6 = constant 6 : index | |
%c2 = constant 2 : index | |
%c4 = constant 4 : index | |
%c1 = constant 1 : index | |
%c3 = constant 3 : index | |
%c112 = constant 112 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y] | |
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8] | |
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7] | |
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13] | |
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12] | |
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14] | |
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13] | |
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 { | |
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) { | |
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) { | |
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%40 = vector.extract %38[0, 0] : vector<1x1xf32> | |
%41 = splat %40 : vector<4xf32> | |
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32> | |
%43 = vector.fma %41, %39, %42 : vector<4xf32> | |
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%46 = vector.extract %44[0, 0] : vector<1x1xf32> | |
%47 = splat %46 : vector<4xf32> | |
%48 = vector.fma %47, %45, %43 : vector<4xf32> | |
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32> | |
%51 = vector.extract %49[0, 0] : vector<1x1xf32> | |
%52 = splat %51 : vector<4xf32> | |
%53 = vector.fma %52, %50, %48 : vector<4xf32> | |
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32> | |
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%58 = vector.extract %56[0, 0] : vector<1x1xf32> | |
%59 = splat %58 : vector<4xf32> | |
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32> | |
%61 = vector.fma %59, %57, %60 : vector<4xf32> | |
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%64 = vector.extract %62[0, 0] : vector<1x1xf32> | |
%65 = splat %64 : vector<4xf32> | |
%66 = vector.fma %65, %63, %61 : vector<4xf32> | |
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32> | |
%69 = vector.extract %67[0, 0] : vector<1x1xf32> | |
%70 = splat %69 : vector<4xf32> | |
%71 = vector.fma %70, %68, %66 : vector<4xf32> | |
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32> | |
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%76 = vector.extract %74[0, 0] : vector<1x1xf32> | |
%77 = splat %76 : vector<4xf32> | |
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32> | |
%79 = vector.fma %77, %75, %78 : vector<4xf32> | |
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%82 = vector.extract %80[0, 0] : vector<1x1xf32> | |
%83 = splat %82 : vector<4xf32> | |
%84 = vector.fma %83, %81, %79 : vector<4xf32> | |
%85 = vector.extract_strided_slice %73 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%86 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32> | |
%87 = vector.extract %85[0, 0] : vector<1x1xf32> | |
%88 = splat %87 : vector<4xf32> | |
%89 = vector.fma %88, %86, %84 : vector<4xf32> | |
%90 = vector.shape_cast %89 : vector<4xf32> to vector<1x4xf32> | |
%91 = vector.transfer_read %32[%c0, %c0, %c6, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%92 = vector.extract_strided_slice %91 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%93 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%94 = vector.extract %92[0, 0] : vector<1x1xf32> | |
%95 = splat %94 : vector<4xf32> | |
%96 = vector.shape_cast %arg10 : vector<1x4xf32> to vector<4xf32> | |
%97 = vector.fma %95, %93, %96 : vector<4xf32> | |
%98 = vector.extract_strided_slice %91 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%99 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%100 = vector.extract %98[0, 0] : vector<1x1xf32> | |
%101 = splat %100 : vector<4xf32> | |
%102 = vector.fma %101, %99, %97 : vector<4xf32> | |
%103 = vector.extract_strided_slice %91 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%104 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32> | |
%105 = vector.extract %103[0, 0] : vector<1x1xf32> | |
%106 = splat %105 : vector<4xf32> | |
%107 = vector.fma %106, %104, %102 : vector<4xf32> | |
%108 = vector.shape_cast %107 : vector<4xf32> to vector<1x4xf32> | |
scf.yield %54, %72, %90, %108 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32> | |
} | |
scf.yield %31#0, %31#1, %31#2, %31#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32> | |
} | |
vector.transfer_write %30#3, %25[%c0, %c0, %c3, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %30#2, %25[%c0, %c0, %c2, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %30#1, %25[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %30#0, %25[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
} | |
return | |
} | |
hal.interface private @io { | |
hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" | |
hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" | |
} | |
} | |
// -----// IR Dump After SPIRVCopyToWorkgroupMemory //----- // | |
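// Annotation: no operands are promoted to workgroup memory under this configuration,
// so as far as this dump shows the pass leaves the function unchanged from the
// preceding Canonicalizer output.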
func @conv_dispatch_0() { | |
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32> | |
%cst_0 = constant 0.000000e+00 : f32 | |
%c6 = constant 6 : index | |
%c2 = constant 2 : index | |
%c4 = constant 4 : index | |
%c1 = constant 1 : index | |
%c3 = constant 3 : index | |
%c112 = constant 112 : index | |
%c0 = constant 0 : index | |
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<1x225x225x3xf32> | |
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<3x3x3x32xf32> | |
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<1x112x112x32xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%workgroup_id_z = hal.interface.workgroup.id[2] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] | |
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y] | |
%6 = memref.subview %1[0, 0, 0, %4] [3, 3, 3, 32] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%7 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%8 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%9 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%10 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%8] | |
%11 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%7] | |
%12 = "gpu.thread_id"() {dimension = "x"} : () -> index | |
%13 = "gpu.thread_id"() {dimension = "y"} : () -> index | |
%14 = "gpu.thread_id"() {dimension = "z"} : () -> index | |
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13] | |
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%12] | |
%17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%14] | |
%18 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%13] | |
%19 = memref.subview %6[0, 0, 0, %16] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
scf.for %arg0 = %workgroup_id_z to %c112 step %c112 { | |
%20 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) | |
%21 = memref.subview %0[0, %20, %5, 0] [1, 3, 17, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%22 = memref.subview %2[0, %arg0, %3, %4] [1, 1, 8, 32] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%23 = memref.subview %22[0, %9, %10, %11] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
vector.transfer_write %cst, %23[%c0, %c0, %c3, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%24 = memref.subview %21[0, %17, %18, 0] [1, 3, 9, 3] [1, 1, 1, 1] : memref<1x3x17x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%25 = memref.subview %22[0, %14, %15, %16] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x1x8x32xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> | |
%26 = vector.transfer_read %25[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%27 = vector.transfer_read %25[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%28 = vector.transfer_read %25[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%29 = vector.transfer_read %25[%c0, %c0, %c3, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x4x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%30:4 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %26, %arg3 = %27, %arg4 = %28, %arg5 = %29) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) { | |
%31:4 = scf.for %arg6 = %c0 to %c3 step %c1 iter_args(%arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4, %arg10 = %arg5) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) { | |
%32 = memref.subview %24[0, %arg1, %arg6, 0] [1, 1, 7, 3] [1, 1, 1, 1] : memref<1x3x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> | |
%33 = memref.subview %19[%arg1, %arg6, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> | |
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32> | |
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%38 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%39 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%40 = vector.extract %38[0, 0] : vector<1x1xf32> | |
%41 = splat %40 : vector<4xf32> | |
%42 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32> | |
%43 = vector.fma %41, %39, %42 : vector<4xf32> | |
%44 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%45 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%46 = vector.extract %44[0, 0] : vector<1x1xf32> | |
%47 = splat %46 : vector<4xf32> | |
%48 = vector.fma %47, %45, %43 : vector<4xf32> | |
%49 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%50 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32> | |
%51 = vector.extract %49[0, 0] : vector<1x1xf32> | |
%52 = splat %51 : vector<4xf32> | |
%53 = vector.fma %52, %50, %48 : vector<4xf32> | |
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32> | |
%55 = vector.transfer_read %32[%c0, %c0, %c2, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%56 = vector.extract_strided_slice %55 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%57 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%58 = vector.extract %56[0, 0] : vector<1x1xf32> | |
%59 = splat %58 : vector<4xf32> | |
%60 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32> | |
%61 = vector.fma %59, %57, %60 : vector<4xf32> | |
%62 = vector.extract_strided_slice %55 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%63 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%64 = vector.extract %62[0, 0] : vector<1x1xf32> | |
%65 = splat %64 : vector<4xf32> | |
%66 = vector.fma %65, %63, %61 : vector<4xf32> | |
%67 = vector.extract_strided_slice %55 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%68 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32> | |
%69 = vector.extract %67[0, 0] : vector<1x1xf32> | |
%70 = splat %69 : vector<4xf32> | |
%71 = vector.fma %70, %68, %66 : vector<4xf32> | |
%72 = vector.shape_cast %71 : vector<4xf32> to vector<1x4xf32> | |
%73 = vector.transfer_read %32[%c0, %c0, %c4, %c0], %cst_0 {in_bounds = [true, true]} : memref<1x1x7x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32> | |
%74 = vector.extract_strided_slice %73 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%75 = vector.shape_cast %34 : vector<1x4xf32> to vector<4xf32> | |
%76 = vector.extract %74[0, 0] : vector<1x1xf32> | |
%77 = splat %76 : vector<4xf32> | |
%78 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32> | |
%79 = vector.fma %77, %75, %78 : vector<4xf32> | |
%80 = vector.extract_strided_slice %73 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32> | |
%81 = vector.shape_cast %35 : vector<1x4xf32> to vector<4xf32> | |
%82 = vector.extract %80[0, 0] : vector<1x1xf32> | |