Skip to content

Instantly share code, notes, and snippets.

@vmurali
Created August 17, 2022 00:28
Show Gist options
  • Save vmurali/2285bb7c5658331d0a1953c8103a6c7b to your computer and use it in GitHub Desktop.
Save vmurali/2285bb7c5658331d0a1953c8103a6c7b to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
// -----// IR Dump After IREEImportPublic (iree-import-public) ('builtin.module' operation) //----- //
// NOTE: First dump in this log. The module holds one function that takes a
// 32x24 LHS, a 24x16 RHS and a 32x16 tensor, and returns the 32x16 result of
// a single linalg.matmul that uses %arg2 as its `outs` operand.
module {
func.func @simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
}
}
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
}
}
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) ('builtin.module' operation) //----- //
// NOTE: The public @simple_matmul_tensor now takes and returns
// !hal.buffer_view values and carries the `iree.abi.stub` attribute. It
// imports each argument as a statically shaped tensor, calls the new private
// function @_simple_matmul_tensor, and exports the result. The original
// tensor-typed matmul body now lives in that private function.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = call @_simple_matmul_tensor(%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
func.func private @_simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = call @_simple_matmul_tensor(%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
func.func private @_simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = call @_simple_matmul_tensor(%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
func.func private @_simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
return %0 : tensor<32x16xf32>
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: Compared with the preceding dump, the `call @_simple_matmul_tensor`
// has been replaced by the linalg.matmul itself, operating directly on the
// imported tensors %0, %1 and %2. @_simple_matmul_tensor is now printed as a
// bodiless private declaration.
// NOTE(review): the header names the Canonicalizer, yet the visible change is
// an inlined call; presumably this canonicalization ran as part of the
// inliner pipeline whose dump follows. Confirm.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
func.func private @_simple_matmul_tensor(tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
}
// -----// IR Dump After Inliner (inline) ('builtin.module' operation) //----- //
// NOTE: The private @_simple_matmul_tensor declaration shown in the preceding
// dump is no longer present. The module now holds only the public function,
// whose body is unchanged.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass (iree-util-demote-f64-to-f32) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. Every tensor element type shown here is already f32.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul (iree-flow-convert-conv2d-1x1-to-matmul) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. The function contains no convolution op, only a linalg.matmul.
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: First dump in which the IR changes after inlining. The matmul no
// longer uses the imported %2 as its `outs` operand. It now accumulates into
// a new 32x16 tensor filled with 0.0 (%3 and %4), and a new linalg.generic
// adds the matmul result %5 to %2 element by element. The identity map #map
// is used for all three operands of that generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// The region adds the two `ins` elements (%arg3, %arg4). The `outs` element
// %arg5 is not read.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapes (iree-flow-expand-tensor-shapes) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. Every tensor type shown here has a fully static shape.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The function body is unchanged from the preceding dump. The only
// visible difference is that the module now carries the attribute
// `iree.fixedpoint.iteration = 0 : index`.
// NOTE(review): presumably that attribute is set by the enclosing fixed-point
// iterator rather than by this pass. Confirm.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. No global ops appear in this module.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass (iree-util-fixed-point-iterator) ('builtin.module' operation) //----- //
// NOTE: The function body is unchanged from the preceding dump. The only
// visible difference is that the module no longer carries the
// `iree.fixedpoint.iteration` attribute.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After PadTensorToTensorInsertSlice (iree-flow-pad-tensor-to-tensor-insert-slice) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. No pad op appears in this function.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. The only arith op on non-constant values (arith.addf) is scalar
// and already sits inside a linalg.generic region.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. None of the tensor shapes shown here has a dimension of size 1.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. The one linalg.generic here has only "parallel" iterators.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log. No dim-query ops appear in this function.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: The IR below is textually identical to the dump that precedes it in
// this log; no change is visible after this pass.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE: Compared with the preceding dump, the element-wise linalg.generic no
// longer uses the zero-filled tensor %4 as its `outs` operand. A second
// linalg.init_tensor (%6) is created for it instead, so the fill result %4
// is now consumed only by the matmul. The later SSA values are renumbered
// accordingly (%7 for the generic, %8 for the export, %9 for the addf).
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Fresh `outs` tensor for the generic. The region below never reads its
// element (%arg5).
%6 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%6 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%9 = arith.addf %arg3, %arg4 : f32
linalg.yield %9 : f32
} -> tensor<32x16xf32>
%8 = hal.tensor.export %7 : tensor<32x16xf32> -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after Canonicalizer. The IR text is the same as in the
// FusionOfTensorOps dump that precedes it in this file: %3 and %6 are still
// two separate linalg.init_tensor ops with identical shape and type.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Zero constant used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// %4 is a 32x16 tensor filled with 0.0; it is the outs operand of the matmul.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Separate, unfilled destination tensor for the element-wise add below.
%6 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Element-wise add of the matmul result (%5) and %2; %arg5 (the outs element)
// is never read by the body.
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%6 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%9 = arith.addf %arg3, %arg4 : f32
linalg.yield %9 : f32
} -> tensor<32x16xf32>
// Export the result tensor as the !hal.buffer_view return value.
%8 = hal.tensor.export %7 : tensor<32x16xf32> -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after CSE. The two identical linalg.init_tensor ops of the
// preceding dump are merged into one: the second one is gone and the
// linalg.generic now uses %3 as its outs operand. SSA names shift back by one
// (%6 generic, %7 export, %8 addf).
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Zero constant used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// %3 is now shared: it is the destination of the fill and also the outs
// operand of the linalg.generic further down.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%5) and %2, written into the
// unfilled %3; %arg5 (the outs element) is never read by the body.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
// Export the result tensor as the !hal.buffer_view return value.
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after SplitReduction. The IR text is the same as in the CSE
// dump that precedes it in this file; the linalg.matmul is still a single op.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Zero constant used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// %3 is the destination of the fill and also the outs operand of the generic.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%5) and %2; %arg5 (the outs element)
// is never read by the body.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
// Export the result tensor as the !hal.buffer_view return value.
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after InterchangeGenericOps. The IR text is the same as in the
// SplitReduction dump that precedes it in this file; the linalg.generic keeps
// its identity indexing maps and two "parallel" iterators.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Zero constant used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// %3 is the destination of the fill and also the outs operand of the generic.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%5) and %2; %arg5 (the outs element)
// is never read by the body.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%8 = arith.addf %arg3, %arg4 : f32
linalg.yield %8 : f32
} -> tensor<32x16xf32>
// Export the result tensor as the !hal.buffer_view return value.
%7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
return %7 : !hal.buffer_view
}
}
// -----// IR Dump After DispatchLinalgOnTensors (iree-flow-dispatch-linalg-on-tensors-pass) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after DispatchLinalgOnTensors. The fill, matmul and
// element-wise add (together with the 0.0 constant) have moved from the
// function body into the region of a single flow.dispatch.workgroups op. The
// function itself now only imports the arguments, issues the dispatch and
// exports the result.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants 32 and 16 (the dimensions of the 32x16 result); they are
// the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// Dispatch region: the three imported tensors appear inside as readonly
// !flow.dispatch.tensor arguments (%arg3..%arg5) and the result as a
// writeonly one (%arg6).
%3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
(%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Same computation as before the dispatch was formed: zero-fill, matmul into
// the filled tensor, then element-wise add of the matmul result and %7.
%8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
%12 = arith.addf %arg7, %arg8 : f32
linalg.yield %12 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
flow.return
} count(%arg3: index, %arg4: index) -> (index, index, index) {
// Workgroup counts (x, y, z) are produced by
// flow.dispatch.default_workgroup_count from the two index arguments.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
flow.return %x, %y, %z : index, index, index
}
// Export the dispatch result as the !hal.buffer_view return value.
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after CaptureDispatchDynamicDims. The IR text is the same as
// in the DispatchLinalgOnTensors dump that precedes it in this file; every
// tensor type in the dispatch is statically shaped.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants 32 and 16; the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// Dispatch region: inputs are readonly !flow.dispatch.tensor arguments
// (%arg3..%arg5), the result is the writeonly %arg6.
%3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
(%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %7.
%8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
%12 = arith.addf %arg7, %arg8 : f32
linalg.yield %12 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
flow.return
} count(%arg3: index, %arg4: index) -> (index, index, index) {
// Workgroup counts (x, y, z) come from flow.dispatch.default_workgroup_count.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
flow.return %x, %y, %z : index, index, index
}
// Export the dispatch result as the !hal.buffer_view return value.
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after Canonicalizer. The IR text is the same as in the
// CaptureDispatchDynamicDims dump that precedes it in this file.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants 32 and 16; the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// Dispatch region: inputs are readonly !flow.dispatch.tensor arguments
// (%arg3..%arg5), the result is the writeonly %arg6.
%3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
(%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %7.
%8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
%12 = arith.addf %arg7, %arg8 : f32
linalg.yield %12 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
flow.return
} count(%arg3: index, %arg4: index) -> (index, index, index) {
// Workgroup counts (x, y, z) come from flow.dispatch.default_workgroup_count.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
flow.return %x, %y, %z : index, index, index
}
// Export the dispatch result as the !hal.buffer_view return value.
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after CSE. The IR text is the same as in the Canonicalizer
// dump that precedes it in this file.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants 32 and 16; the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// Dispatch region: inputs are readonly !flow.dispatch.tensor arguments
// (%arg3..%arg5), the result is the writeonly %arg6.
%3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
(%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %7.
%8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
%12 = arith.addf %arg7, %arg8 : f32
linalg.yield %12 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
flow.return
} count(%arg3: index, %arg4: index) -> (index, index, index) {
// Workgroup counts (x, y, z) come from flow.dispatch.default_workgroup_count.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
flow.return %x, %y, %z : index, index, index
}
// Export the dispatch result as the !hal.buffer_view return value.
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) ('builtin.module' operation) //----- //
// Module state after InitializeEmptyTensors. The IR text is the same as in
// the CSE dump that precedes it in this file; the only linalg.init_tensor
// (%8) sits inside the dispatch region and is left as it was.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants 32 and 16; the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Import the three buffer-view arguments as 32x24, 24x16 and 32x16 f32 tensors.
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// Dispatch region: inputs are readonly !flow.dispatch.tensor arguments
// (%arg3..%arg5), the result is the writeonly %arg6.
%3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
(%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %7.
%8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
%12 = arith.addf %arg7, %arg8 : f32
linalg.yield %12 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
flow.return
} count(%arg3: index, %arg4: index) -> (index, index, index) {
// Workgroup counts (x, y, z) come from flow.dispatch.default_workgroup_count.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
flow.return %x, %y, %z : index, index, index
}
// Export the dispatch result as the !hal.buffer_view return value.
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) ('builtin.module' operation) //----- //
// Module state after OutlineDispatchRegions. The body of the former
// flow.dispatch.workgroups region is now a standalone function inside
// flow.executable @simple_matmul_tensor_dispatch_0, its workgroup-count
// region is attached to the executable's export, and @simple_matmul_tensor
// invokes it through a flow.dispatch op that names the executable and entry.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index arguments into (x, y, z) workgroup
// counts via flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Outlined dispatch body: three readonly inputs (%arg0..%arg2) and one
// writeonly output (%arg3); the result is stored rather than returned.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add of the
// matmul result (%5) and %2; %arg6 (the outs element) is never read.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: imports the arguments, dispatches the outlined executable
// with the bracketed operands [%c32, %c16], and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::StripDebugOpsPass (iree-util-strip-debug-ops) ('flow.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
// Module state after StripDebugOps (run on the flow.executable). The IR text
// is the same as in the OutlineDispatchRegions dump that precedes it in this
// file.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index arguments into (x, y, z) workgroup counts.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: three readonly inputs (%arg0..%arg2), one writeonly output (%arg3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %2.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: imports the arguments, dispatches the executable with
// [%c32, %c16], and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module state after Canonicalizer (run on func.func @simple_matmul_tensor;
// the dump still prints the whole module). The IR text is the same as in the
// StripDebugOps dump that precedes it in this file.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index arguments into (x, y, z) workgroup counts.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: three readonly inputs (%arg0..%arg2), one writeonly output (%arg3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %2.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: imports the arguments, dispatches the executable with
// [%c32, %c16], and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) ('builtin.module' operation) //----- //
// Module state after DeduplicateExecutables. The IR text is the same as in
// the Canonicalizer dump that precedes it in this file; the module contains a
// single flow.executable.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index arguments into (x, y, z) workgroup counts.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: three readonly inputs (%arg0..%arg2), one writeonly output (%arg3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %2.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: imports the arguments, dispatches the executable with
// [%c32, %c16], and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('flow.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
// Module state after Canonicalizer (run on the flow.executable). The IR text
// is the same as in the DeduplicateExecutables dump that precedes it in this
// file.
// #map is the 2-D identity map used for every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index arguments into (x, y, z) workgroup counts.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: three readonly inputs (%arg0..%arg2), one writeonly output (%arg3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each input in full (zero offsets, full sizes, unit strides).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill, matmul into the filled tensor, then element-wise add with %2.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: imports the arguments, dispatches the executable with
// [%c32, %c16], and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('flow.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInput (iree-stream-verify-input) ('builtin.module' operation) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineConstants (iree-stream-outline-constants) ('builtin.module' operation) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity map over two dimensions; referenced three times by the indexing_maps of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private executable wrapping the single dispatch function that @simple_matmul_tensor invokes.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export for the dispatch function: takes two index operands and returns the (x, y, z) values
// produced by flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch function: reads three readonly f32 arguments (32x24, 24x16, 32x16) and writes one
// writeonly 32x16 f32 argument. The value stored is matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly argument in full (zero offsets, unit strides, sizes equal to the static shape).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 feeds the outs operand of both linalg.fill and linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
// Fill with the 0.0 constant, then use the filled tensor as the outs operand of the matmul.
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over the 32x16 result: yields %arg4 + %arg5 (matmul result plus the third input);
// the block argument %arg6 that corresponds to the outs operand is unused.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly argument.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point marked iree.abi.stub: imports three buffer views as statically shaped tensors, invokes the
// dispatch with %c32 and %c16 as its bracketed index operands, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Private flow executable holding one exported entry point and the module with its body.
// The body computes matmul(%arg0, %arg1) + %arg2 and stores the sum to %arg3.
flow.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Arguments are !flow.dispatch.tensor values: three readonly inputs (32x24, 24x16, 32x16 of f32)
// and one writeonly 32x16 f32 output.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly input in full (offsets [0, 0], unit strides).
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %3 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
%5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%7 = arith.addf %arg4, %arg5 : f32
linalg.yield %7 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly output.
flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point (carries the iree.abi.stub attribute): imports three !hal.buffer_view arguments
// as statically shaped tensors, runs the single dispatch, and exports the result as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Index constants passed in brackets to flow.dispatch below.
// NOTE(review): presumably the workload forwarded to the export's workgroups region -- confirm.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
%2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
// All three imported tensors are operands; the dispatch yields one 32x16 f32 tensor.
%3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
%4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToStream (iree-stream-conversion) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values here; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// In this dump the constants are duplicated per argument (%c32_0, %c553648160_i32_1, %c1_i32_2, ...).
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 are only used as the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c32_0 = arith.constant 32 : index
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32_0, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%c553648160_i32_1 = arith.constant 553648160 : i32
%c1_i32_2 = arith.constant 1 : i32
%c24_3 = arith.constant 24 : index
%c16_4 = arith.constant 16 : index
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24_3, %c16_4]) type(%c553648160_i32_1) encoding(%c1_i32_2)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%c553648160_i32_5 = arith.constant 553648160 : i32
%c1_i32_6 = arith.constant 1 : i32
%c32_7 = arith.constant 32 : index
%c16_8 = arith.constant 16 : index
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32_7, %c16_8]) type(%c553648160_i32_5) encoding(%c1_i32_6)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// Size of the result resource; same tensor type as the sizeof that defines %6.
%9 = stream.tensor.sizeof tensor<32x16xf32> : index
%10 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9}
// Move the result to an external resource so it can be exported as a buffer view.
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} -> !stream.resource<external>{%9}
%12 = stream.tensor.export %11 : tensor<32x16xf32> in !stream.resource<external>{%9} -> !hal.buffer_view
return %12 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// Constants are still duplicated per argument here (%c32_0, %c553648160_i32_1, %c1_i32_2, ...).
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 are only used as the bracketed operands of the dispatch below.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c32_0 = arith.constant 32 : index
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32_0, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
%c553648160_i32_1 = arith.constant 553648160 : i32
%c1_i32_2 = arith.constant 1 : i32
%c24_3 = arith.constant 24 : index
%c16_4 = arith.constant 16 : index
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24_3, %c16_4]) type(%c553648160_i32_1) encoding(%c1_i32_2)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%c553648160_i32_5 = arith.constant 553648160 : i32
%c1_i32_6 = arith.constant 1 : i32
%c32_7 = arith.constant 32 : index
%c16_8 = arith.constant 16 : index
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32_7, %c16_8]) type(%c553648160_i32_5) encoding(%c1_i32_6)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// Size of the result resource; same tensor type as the sizeof that defines %6.
%9 = stream.tensor.sizeof tensor<32x16xf32> : index
%10 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9}
// Move the result to an external resource so it can be exported as a buffer view.
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} -> !stream.resource<external>{%9}
%12 = stream.tensor.export %11 : tensor<32x16xf32> in !stream.resource<external>{%9} -> !hal.buffer_view
return %12 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// In this dump a single set of constants (%c32, %c16, %c24, %c553648160_i32, %c1_i32) is shared
// by all three asserts.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 feed both the shape asserts and the bracketed operands of the dispatch.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// Size of the result resource; same tensor type as the sizeof that defines %6.
%9 = stream.tensor.sizeof tensor<32x16xf32> : index
%10 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9}
// Move the result to an external resource so it can be exported as a buffer view.
%11 = stream.async.transfer %10 : !stream.resource<*>{%9} -> !stream.resource<external>{%9}
%12 = stream.tensor.export %11 : tensor<32x16xf32> in !stream.resource<external>{%9} -> !hal.buffer_view
return %12 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// In this dump there is no separate sizeof for the result: the dispatch result, the final
// transfer and the export all reuse %6, the size of the 32x16 f32 argument.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 feed both the shape asserts and the bracketed operands of the dispatch.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// The result resource is sized with %6.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Move the result to an external resource so it can be exported as a buffer view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// The dispatch result, the final transfer and the export all reuse %6 as the resource size.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 feed both the shape asserts and the bracketed operands of the dispatch.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// The result resource is sized with %6.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Move the result to an external resource so it can be exported as a buffer view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
// In this dump both constants (%cst, %c0) sit at the top of the function body, ahead of the subspans.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// The dispatch result, the final transfer and the export all reuse %6 as the resource size.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 feed both the shape asserts and the bracketed operands of the dispatch.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// The result resource is sized with %6.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Move the result to an external resource so it can be exported as a buffer view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Stream-dialect form of the executable. The wrapper, export and return ops are stream.* ops,
// while the workgroup-count op and the tensor load/store ops inside remain flow.* ops.
// Both constants (%cst, %c0) sit at the top of the function body, ahead of the subspans.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: takes two index values and returns three index values produced by
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// All four arguments are opaque !stream.binding values; the typed
// !flow.dispatch.tensor views are recovered by the subspan ops below.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Zero value used to initialise the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Every binding is viewed at offset %c0.
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each readonly input in full (offsets [0, 0], unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 serves twice: as the destination of the zero fill and as the outs operand of the linalg.generic below.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%arg4) and the third input (%arg5); %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the writeonly view of the fourth binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry point in stream form. Each !hal.buffer_view argument is asserted, sized, imported as an
// external stream resource and transferred to a !stream.resource<*>. The dispatch result is
// transferred back to an external resource and exported as a !hal.buffer_view.
// The dispatch result, the final transfer and the export all reuse %6 as the resource size.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c32 / %c16 feed both the shape asserts and the bracketed operands of the dispatch.
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type code for f32, and
// encoding 1 presumably the dense row-major encoding -- confirm against the IREE HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape 32x24 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// Argument 1: assert shape 24x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// Argument 2: assert shape 32x16 plus type/encoding, then import and transfer.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// The result resource is sized with %6.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Move the result to an external resource so it can be exported as a buffer view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the fuse-globals pass: one device executable plus the host function that dispatches it.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource sized by stream.tensor.sizeof, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// %arg1: same sequence for the 24x16 operand.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// %arg2: same sequence for the 32x16 operand.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has the same size (%6) as the third operand.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::CombineInitializersPass (iree-util-combine-initializers) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the combine-initializers pass: one device executable plus the host function that dispatches it.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource sized by stream.tensor.sizeof, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// %arg1: same sequence for the 24x16 operand.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// %arg2: same sequence for the 32x16 operand.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has the same size (%6) as the third operand.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After EncodeDeviceTensors (iree-stream-encode-device-tensors) ('stream.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the encode-device-tensors pass (run on the executable): one device executable plus the host function that dispatches it.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource sized by stream.tensor.sizeof, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.sizeof tensor<32x24xf32> : index
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
// %arg1: same sequence for the 24x16 operand.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%3 = stream.tensor.sizeof tensor<24x16xf32> : index
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
// %arg2: same sequence for the 32x16 operand.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%6 = stream.tensor.sizeof tensor<32x16xf32> : index
%7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
%8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has the same size (%6) as the third operand.
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
%11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
return %11 : !hal.buffer_view
}
}
// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the encode-host-tensors pass (run on @simple_matmul_tensor): one device executable plus the host function that dispatches it.
// Unlike the dump that precedes it in this file, the host function no longer contains stream.tensor.sizeof ops; resource sizes are index constants.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. consistent with 4 bytes per f32 element.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource of size %c3072, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
// %arg1: same sequence for the 24x16 operand with size %c1536.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
// %arg2: same sequence for the 32x16 operand with size %c2048.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has size %c2048, like the third operand.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeBuiltins (iree-stream-materialize-builtins) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the materialize-builtins pass: one device executable plus the host function that dispatches it.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. consistent with 4 bytes per f32 element.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource of size %c3072, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
// %arg1: same sequence for the 24x16 operand with size %c1536.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
// %arg2: same sequence for the 32x16 operand with size %c2048.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has size %c2048, like the third operand.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the canonicalize pass (run on @simple_matmul_tensor): one device executable plus the host function that dispatches it.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. consistent with 4 bytes per f32 element.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource of size %c3072, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
// %arg1: same sequence for the 24x16 operand with size %c1536.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
// %arg2: same sequence for the 32x16 operand with size %c2048.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has size %c2048, like the third operand.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot printed after the cse pass (run on @simple_matmul_tensor): one device executable plus the host function that dispatches it.
module {
// Executable wrapping the single matmul dispatch function.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry point; its workgroups region turns two index arguments into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are bound as readonly tensors and %arg3 as the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each binding is taken at offset %c0.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor that the matmul accumulates into.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// (32x24) x (24x16) matmul into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise pass over 32x16: adds the matmul result element (%arg4) to the loaded %6 element (%arg5); %arg6, the outs element, is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the writeonly binding %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, and exports the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. consistent with 4 bytes per f32 element.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Numeric codes passed to the type(...) and encoding(...) clauses of the asserts below; what the values denote is not shown in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// %arg0: assert shape 32x24, import as an external resource of size %c3072, then transfer to a `*` resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
// %arg1: same sequence for the 24x16 operand with size %c1536.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
// %arg2: same sequence for the 32x16 operand with size %c2048.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Dispatch the exported matmul with the bracketed operands [%c32, %c16] (presumably the workload); the result resource has size %c2048, like the third operand.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result back to an external resource and export it as a !hal.buffer_view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-util-simplify-global-accesses ran on @simple_matmul_tensor.
// It holds one device executable (@simple_matmul_tensor_dispatch_0) and the host entry point.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, import as an external resource, then transfer to a `!stream.resource<*>` value.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Launch the dispatch with workload [%c32, %c16]; it yields a 2048-sized resource.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result to an external resource and export it as a buffer view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-util-apply-patterns (module scope).
// The IR text below is the same as in the preceding dump in this log.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, import as an external resource, then transfer to a `!stream.resource<*>` value.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Launch the dispatch with workload [%c32, %c16]; it yields a 2048-sized resource.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result to an external resource and export it as a buffer view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-util-fold-globals (module scope).
// The IR text below is the same as in the preceding dump in this log.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, import as an external resource, then transfer to a `!stream.resource<*>` value.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Launch the dispatch with workload [%c32, %c16]; it yields a 2048-sized resource.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result to an external resource and export it as a buffer view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-util-fuse-globals (module scope).
// The IR text below is the same as in the preceding dump in this log.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, import as an external resource, then transfer to a `!stream.resource<*>` value.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Launch the dispatch with workload [%c32, %c16]; it yields a 2048-sized resource.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result to an external resource and export it as a buffer view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-stream-materialize-copy-on-write ran on @simple_matmul_tensor.
// The IR text below is the same as in the preceding dump in this log.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, import as an external resource, then transfer to a `!stream.resource<*>` value.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Launch the dispatch with workload [%c32, %c16]; it yields a 2048-sized resource.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result to an external resource and export it as a buffer view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After ElideAsyncCopies (iree-stream-elide-async-copies) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-stream-elide-async-copies (module scope).
// The IR text below is the same as in the preceding dump in this log.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, import as an external resource, then transfer to a `!stream.resource<*>` value.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
%1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
%3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
// Launch the dispatch with workload [%c32, %c16]; it yields a 2048-sized resource.
%6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
// Transfer the result to an external resource and export it as a buffer view.
%7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After RefineUsage (iree-stream-refine-usage) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-stream-refine-usage (module scope).
// Relative to the preceding dump, the host function no longer contains stream.async.transfer ops and
// the dispatch operands/result are typed !stream.resource<external> instead of !stream.resource<*>.
// The executable text is the same as in the preceding dump.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch on them directly, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, then import as an external resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Launch the dispatch on the imported resources with workload [%c32, %c16]; the result is an external 2048-sized resource.
%3 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
// Export the dispatch result as a buffer view.
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after iree-stream-schedule-execution ran on @simple_matmul_tensor.
// Relative to the preceding dump, the dispatch in the host function is now nested inside a
// stream.async.execute region whose timepoint is awaited before the result is exported.
// The executable text is the same as in the preceding dump.
module {
// Device-side executable: one export plus the nested module holding the dispatch body.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export region: maps the two index workload arguments to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: %arg0..%arg2 are viewed as readonly tensors, %arg3 as the writeonly result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three input tensors in full (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-fill a 32x16 tensor and use it as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the loaded 32x16 input (%6); the outs element %arg6 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum through the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch inside an execute region, exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably f32 byte sizes; confirm).
// NOTE(review): 553648160 and 1 are passed as the expected type and encoding codes; their meaning is not visible here.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Per argument: assert shape/type/encoding, then import as an external resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Execute region: captures the three imported resources as %arg3..%arg5, runs the single dispatch,
// and yields its result; the op returns the resource plus a !stream.timepoint.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the timepoint to obtain the result resource, then export it as a buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) ('func.func' operation: @simple_matmul_tensor) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region, waits on its timepoint, and returns the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region capturing the three imported resources; it issues the dispatch
// with workload [%c32, %c16] and yields a %c2048-sized external resource
// together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) ('builtin.module' operation) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region gated on an immediate timepoint, waits on
// the region's timepoint, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Four immediate timepoints are materialized here; only %6 is referenced (by
// the await clause of the execute op below). %3, %4 and %5 have no uses in
// this function.
%3 = stream.timepoint.immediate => !stream.timepoint
%4 = stream.timepoint.immediate => !stream.timepoint
%5 = stream.timepoint.immediate => !stream.timepoint
%6 = stream.timepoint.immediate => !stream.timepoint
// Async region that awaits %6, captures the three imported resources, issues
// the dispatch with workload [%c32, %c16], and yields a %c2048-sized external
// resource together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute await(%6) => with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %9 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %8 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region, waits on its timepoint, and returns the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region capturing the three imported resources; it issues the dispatch
// with workload [%c32, %c16] and yields a %c2048-sized external resource
// together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region, waits on its timepoint, and returns the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region capturing the three imported resources; it issues the dispatch
// with workload [%c32, %c16] and yields a %c2048-sized external resource
// together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region, waits on its timepoint, and returns the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region capturing the three imported resources; it issues the dispatch
// with workload [%c32, %c16] and yields a %c2048-sized external resource
// together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region, waits on its timepoint, and returns the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region capturing the three imported resources; it issues the dispatch
// with workload [%c32, %c16] and yields a %c2048-sized external resource
// together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// 2-D identity indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module holding one dispatch executable and the host entry function that invokes it.
module {
// Executable containing the single dispatch entry point referenced by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two workload values into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: stores matmul(%arg0, %arg1) + %arg2 (elementwise add) into %arg3.
// %arg0..%arg2 are viewed read-only, %arg3 write-only.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of all three inputs (zero offsets, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input. The outs block
// argument %arg6 is never read in the body, so the contents of %7 are not used here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks and imports three buffer views, runs the dispatch
// inside an async execution region, waits on its timepoint, and returns the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (element count times 4; presumably bytes of f32 - TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region capturing the three imported resources; it issues the dispatch
// with workload [%c32, %c16] and yields a %c2048-sized external resource
// together with a timepoint.
// NOTE(review): three operands are passed while the dispatch function declares
// four bindings; presumably the result resource backs the fourth - confirm.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Await the region's timepoint before exporting the result as a 32x16xf32 buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// runs the matmul dispatch asynchronously, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region: captures the three imported resources and produces the 2048-byte result plus a timepoint.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
// Dispatch with workload [%c32, %c16]; the result is returned as a value rather than written into a passed-in resource.
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Wait on the timepoint before using the result, then export it as the returned buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// runs the matmul dispatch asynchronously, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Async region: captures the three imported resources and produces the 2048-byte result plus a timepoint.
%results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
// Dispatch with workload [%c32, %c16]; the result is returned as a value rather than written into a passed-in resource.
%5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
stream.yield %5 : !stream.resource<external>{%c2048}
} => !stream.timepoint
// Wait on the timepoint before using the result, then export it as the returned buffer view.
%3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
%4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %4 : !hal.buffer_view
}
}
// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// allocates the output resource, records the matmul dispatch, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// Output storage is allocated explicitly (uninitialized) and handed to the dispatch as a write-only binding.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region: captures the three inputs and the output allocation; its only result is a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]. Each binding spans its whole resource ([%c0 for size]):
// three read-only (ro) inputs followed by one write-only (wo) output.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the timepoint, after which the allocation %3 is used as the result, and export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After PackConstants (iree-stream-pack-constants) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// allocates the output resource, records the matmul dispatch, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// Output storage is allocated explicitly (uninitialized) and handed to the dispatch as a write-only binding.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region: captures the three inputs and the output allocation; its only result is a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]. Each binding spans its whole resource ([%c0 for size]):
// three read-only (ro) inputs followed by one write-only (wo) output.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the timepoint, after which the allocation %3 is used as the result, and export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// allocates the output resource, records the matmul dispatch, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// Output storage is allocated explicitly (uninitialized) and handed to the dispatch as a write-only binding.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region: captures the three inputs and the output allocation; its only result is a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]. Each binding spans its whole resource ([%c0 for size]):
// three read-only (ro) inputs followed by one write-only (wo) output.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the timepoint, after which the allocation %3 is used as the result, and export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// allocates the output resource, records the matmul dispatch, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// Output storage is allocated explicitly (uninitialized) and handed to the dispatch as a write-only binding.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region: captures the three inputs and the output allocation; its only result is a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]. Each binding spans its whole resource ([%c0 for size]):
// three read-only (ro) inputs followed by one write-only (wo) output.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the timepoint, after which the allocation %3 is used as the result, and export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::PropagateSubrangesPass (iree-util-propagate-subranges) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable containing the one dispatch entry point used by @simple_matmul_tensor.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Exported entry point: turns the two workload values given at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
// The count is still expressed with the flow-level default op here.
// NOTE(review): presumably replaced with concrete values by a later pass; confirm.
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings as used below: %arg0 = 32x24 input, %arg1 = 24x16 input,
// %arg2 = 32x16 input (all read-only), %arg3 = 32x16 output (write-only).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of the three read-only tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Zero-filled 32x16 tensor used as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the tensor loaded from %arg2 (%6).
// The outs element %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result through the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point (marked iree.abi.stub): checks and imports the three buffer views,
// allocates the output resource, records the matmul dispatch, then exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes; they equal the f32 byte sizes of the tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Element-type code checked by the asserts (0x21000020).
// NOTE(review): presumably the HAL code for f32; confirm against the HAL element type enum.
%c553648160_i32 = arith.constant 553648160 : i32
// Encoding code checked by the asserts.
// NOTE(review): presumably dense row-major; confirm.
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// Output storage is allocated explicitly (uninitialized) and handed to the dispatch as a write-only binding.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region: captures the three inputs and the output allocation; its only result is a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]. Each binding spans its whole resource ([%c0 for size]):
// three read-only (ro) inputs followed by one write-only (wo) output.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the timepoint, after which the allocation %3 is used as the result, and export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the Canonicalizer pass: one stream executable holding
// the device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the CSE pass: one stream executable holding the
// device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the SimplifyGlobalAccessesPass: one stream executable
// holding the device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the ApplyPatternsPass: one stream executable holding
// the device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the FoldGlobalsPass: one stream executable holding
// the device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the FuseGlobalsPass: one stream executable holding
// the device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToCmd (iree-stream-verify-lowering-to-cmd) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module as printed after the VerifyLoweringToCmd pass: one stream executable
// holding the device-side dispatch function, plus the host-side entry point
// @simple_matmul_tensor that launches it.
module {
// Executable containing a single exported dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. %arg0..%arg2 are viewed below as read-only 32x24, 24x16 and
// 32x16 f32 tensors; %arg3 is viewed as the write-only 32x16 f32 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding, starting at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent (zero offsets, unit strides) of the three inputs.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// 32x16 tensor filled with %cst (0.0); used as the matmul's output operand.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = matmul of %4 (32x24) and %5 (24x16) into the zero-filled tensor %8.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). %7 is
// only the output operand; its block argument %arg6 is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only binding view %3.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: checks and imports the three buffer views, runs the
// dispatch, and exports the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used by the stream ops below. Each equals rows*cols*4 for the
// matching f32 tensor: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code for f32 and the encoding
// code for the expected layout, respectively -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert its shape, type and encoding, then import it as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that the dispatch writes its result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource; it yields
// a timepoint. The dispatch uses workload [%c32, %c16], binds the inputs
// read-only (ro) and the output write-only (wo), each over [%c0, size).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the timepoint before using %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
stream.executable private @simple_matmul_tensor_dispatch_0 {
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After ElideTimepoints (iree-stream-elide-timepoints) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module snapshot: one stream executable (the dispatch body) plus the entry
// function @simple_matmul_tensor that launches it.
module attributes {iree.fixedpoint.iteration = 0 : index} {
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: turns the two index workload operands into an (x, y, z)
// workgroup count via flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: bindings %arg0..%arg2 are viewed read-only, %arg3 write-only.
// Computes matmul(%arg0, %arg1) into a zero-filled accumulator, adds the %arg2
// tensor elementwise, and stores the 32x16 result through %arg3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// View each binding at offset %c0 as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the full extent of each read-only operand.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is the 32x16 destination tensor; %8 fills it with 0.0 as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result (%9) and the third input (%6). The output
// block argument %arg6 is never read, so %7 only supplies the result shape/type here.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Write the whole 32x16 result to the write-only binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Entry function: asserts and imports the three buffer views, runs the dispatch,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used in the `{...}` annotations below; they equal the f32 tensor
// footprints at 4 bytes per element: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts below
// (presumably the HAL codes for f32 and a dense layout -- TODO confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then
// imported as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region with a single dispatch, workload [%c32, %c16]. Every resource is
// bound over its full range [%c0 for size]: three read-only (ro), one write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execution timepoint on %3, then export it as the returned buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass (iree-util-fixed-point-iterator) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// Visible dataflow: result = matmul(binding0, binding1) + binding2, written to binding3.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count
// using flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: takes four !stream.binding arguments and no scalar operands.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Every binding is viewed at offset %c0: three readonly tensors and one writeonly tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads (zero offsets, unit strides) of the three readonly tensors.
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%8), not into the loaded %6.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %6; the outs block argument (%arg6) is not read in the body.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: takes three !hal.buffer_view arguments and returns one.
// It asserts each input, imports it as a stream resource, allocates an output
// resource, issues one dispatch, awaits it and exports the output.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Separate uninitialized resource for the result; the third input (%2) is only read.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// One dispatch with [%c32, %c16] in brackets, three `ro` bindings and one `wo` binding,
// each covering its whole resource starting at offset %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// In this dump the dispatch function takes four extra index operands that
// are used as the per-binding subspan offsets.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0..%arg3 are the bindings; %arg4..%arg7 are index operands, one per binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each subspan offset is %c0 plus the matching index operand.
%0 = arith.addi %c0, %arg4 : index
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%2 = arith.addi %c0, %arg5 : index
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%4 = arith.addi %c0, %arg6 : index
%5 = stream.binding.subspan %arg2[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%6 = arith.addi %c0, %arg7 : index
%7 = stream.binding.subspan %arg3[%6] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads of the three readonly tensors.
%8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%10 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%12).
%11 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<32x16xf32>) -> tensor<32x16xf32>
%13 = linalg.matmul ins(%8, %9 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %10; the outs block argument (%arg10) is not read.
%14 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%13, %10 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%11 : tensor<32x16xf32>) {
^bb0(%arg8: f32, %arg9: f32, %arg10: f32):
%15 = arith.addf %arg8, %arg9 : f32
linalg.yield %15 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %14, %7, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: assert + import three buffer views, allocate the
// output resource, issue one dispatch, await it and export the result.
// In this dump the dispatch carries four index operands, all %c0.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// A second zero index constant; it is the one used for the binding offsets below,
// while %c0 is passed as the dispatch operands.
%c0_0 = arith.constant 0 : index
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Four index operands (all %c0) follow the bracketed [%c32, %c16]; one per binding.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%c0, %c0, %c0, %c0 : index, index, index, index) {
ro %arg3[%c0_0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0_0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0_0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0_0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// In this dump the four scalar operands are i32 and are cast back to index
// before being used as subspan offsets.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0..%arg3 are the bindings; %arg4..%arg7 are i32 operands, one per binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
// Convert the i32 operands to index values.
%0 = arith.index_cast %arg4 : i32 to index
%1 = arith.index_cast %arg5 : i32 to index
%2 = arith.index_cast %arg6 : i32 to index
%3 = arith.index_cast %arg7 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each subspan offset is %c0 plus the matching converted operand.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads of the three readonly tensors.
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%16).
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %14; the outs block argument (%arg10) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg8: f32, %arg9: f32, %arg10: f32):
%19 = arith.addf %arg8, %arg9 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: assert + import three buffer views, allocate the
// output resource, issue one dispatch, await it and export the result.
// In this dump the dispatch operands are four separate i32 zero constants.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// NOTE(review): this %c0 has no visible use in the function body; the binding
// offsets below use %c0_0 instead.
%c0 = arith.constant 0 : index
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Zero index used for the binding offsets, followed by one i32 zero per dispatch operand.
%c0_0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c0_i32_1 = arith.constant 0 : i32
%c0_i32_2 = arith.constant 0 : i32
%c0_i32_3 = arith.constant 0 : i32
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Four i32 operands follow the bracketed [%c32, %c16]; all are constant zero.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%c0_i32, %c0_i32_1, %c0_i32_2, %c0_i32_3 : i32, i32, i32, i32) {
ro %arg3[%c0_0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0_0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0_0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0_0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// The dispatch function takes four bindings plus four i32 operands that are
// cast to index and used as subspan offsets.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0..%arg3 are the bindings; %arg4..%arg7 are i32 operands, one per binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
// Convert the i32 operands to index values.
%0 = arith.index_cast %arg4 : i32 to index
%1 = arith.index_cast %arg5 : i32 to index
%2 = arith.index_cast %arg6 : i32 to index
%3 = arith.index_cast %arg7 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each subspan offset is %c0 plus the matching converted operand.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads of the three readonly tensors.
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%16).
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %14; the outs block argument (%arg10) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg8: f32, %arg9: f32, %arg10: f32):
%19 = arith.addf %arg8, %arg9 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: assert + import three buffer views, allocate the
// output resource, issue one dispatch, await it and export the result.
// In this dump a single %c0_i32 feeds all four dispatch operands and a single
// %c0 is used for every binding offset.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Zero constants: %c0 for binding offsets, %c0_i32 for the dispatch operands.
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// All four i32 operands are the same SSA value, %c0_i32.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// In this dump the dispatch function takes only the four bindings; the
// subspan offsets are derived from a local i32 zero constant.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
// Local zero constant, cast to index four times (once per binding).
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%3 = arith.index_cast %c0_i32 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each subspan offset is %c0 plus a cast of constant zero, i.e. zero.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads of the three readonly tensors.
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%16).
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %14; the outs block argument (%arg6) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%19 = arith.addf %arg4, %arg5 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: assert + import three buffer views, allocate the
// output resource, issue one dispatch, await it and export the result.
// In this dump the dispatch has no scalar operand list.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// NOTE(review): %c0_i32 has no visible use in this function body.
%c0_i32 = arith.constant 0 : i32
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// One dispatch with [%c32, %c16] in brackets, three `ro` bindings and one `wo` binding,
// each covering its whole resource starting at offset %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// In this dump every binding argument carries a stream.alignment = 64 attribute.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Four bindings, each annotated {stream.alignment = 64 : index}; no scalar operands.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
// Local zero constant, cast to index four times (once per binding).
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%3 = arith.index_cast %c0_i32 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each subspan offset is %c0 plus a cast of constant zero, i.e. zero.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads of the three readonly tensors.
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%16).
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %14; the outs block argument (%arg6) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%19 = arith.addf %arg4, %arg5 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: assert + import three buffer views, allocate the
// output resource, issue one dispatch (no scalar operands), await it and
// export the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%c0 = arith.constant 0 : index
// NOTE(review): %c0_i32 has no visible use in this function body.
%c0_i32 = arith.constant 0 : i32
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// One dispatch with [%c32, %c16] in brackets, three `ro` bindings and one `wo` binding,
// each covering its whole resource starting at offset %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Executable holding the single dispatch region of this module.
// Visible dataflow: result = matmul(binding0, binding1) + binding2, written to binding3.
// The zero-offset arithmetic (index_cast of constant 0, then addi with %c0) is
// still present in the dispatch function in this dump.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export entry: maps two index operands to an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Four bindings, each annotated {stream.alignment = 64 : index}; no scalar operands.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
// Local zero constant, cast to index four times (once per binding).
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%3 = arith.index_cast %c0_i32 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Each subspan offset is %c0 plus a cast of constant zero, i.e. zero.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Full-extent loads of the three readonly tensors.
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor (%16).
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and %14; the outs block argument (%arg6) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%19 = arith.addf %arg4, %arg5 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
// Store the sum over the full extent of the writeonly binding.
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: assert + import three buffer views, allocate the
// output resource, issue one dispatch (no scalar operands), await it and
// export the result. All constants are grouped at the top of the body.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they match 32*24*4, 24*16*4 and 32*16*4
// (presumably byte sizes of the f32 tensors -- confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Opaque i32 codes passed to the type(...) and encoding(...) clauses of the asserts.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Separate uninitialized resource for the result; the third input (%2) is only read.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// One dispatch with [%c32, %c16] in brackets, three `ro` bindings and one `wo` binding,
// each covering its whole resource starting at offset %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
// The four index_casts of %c0_i32 and the addi ops with %c0 below all evaluate to
// offset 0; they are still unfolded at this point in the dump.
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%3 = arith.index_cast %c0_i32 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Bindings 0-2 are viewed as read-only tensors, binding 3 as the write-only output.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%19 = arith.addf %arg4, %arg5 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
// The four index_casts of %c0_i32 and the addi ops with %c0 below all evaluate to
// offset 0; they are still unfolded at this point in the dump.
%c0_i32 = arith.constant 0 : i32
%0 = arith.index_cast %c0_i32 : i32 to index
%1 = arith.index_cast %c0_i32 : i32 to index
%2 = arith.index_cast %c0_i32 : i32 to index
%3 = arith.index_cast %c0_i32 : i32 to index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
// Bindings 0-2 are viewed as read-only tensors, binding 3 as the write-only output.
%4 = arith.addi %c0, %0 : index
%5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%6 = arith.addi %c0, %1 : index
%7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%8 = arith.addi %c0, %2 : index
%9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%10 = arith.addi %c0, %3 : index
%11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
%17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%19 = arith.addf %arg4, %arg5 : f32
linalg.yield %19 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// All four bindings are addressed at the constant offset %c0: bindings 0-2 as read-only
// tensors, binding 3 as the write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// All four bindings are addressed at the constant offset %c0: bindings 0-2 as read-only
// tensors, binding 3 as the write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// All four bindings are addressed at the constant offset %c0: bindings 0-2 as read-only
// tensors, binding 3 as the write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// All four bindings are addressed at the constant offset %c0: bindings 0-2 as read-only
// tensors, binding 3 as the write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Identity 2-D indexing map shared by all operands of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
// Device-side executable holding the single matmul dispatch entry point.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: the workgroup count is derived from the two workload values via
// flow.dispatch.default_workgroup_count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// All four bindings are addressed at the constant offset %c0: bindings 0-2 as read-only
// tensors, binding 3 as the write-only output.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load each input in full (zero offsets, unit strides, sizes equal to the tensor shape).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// The matmul accumulates into a zero-filled 32x16 tensor rather than into the third input.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Elementwise add of the matmul result and the third input; the outs element (%arg6) is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host-side entry point: validates the three incoming buffer views, imports them as
// external stream resources, runs the matmul dispatch, and exports the result buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (consistent with 4 units per f32 element of each tensor shape).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 presumably the dense row-major encoding; confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is a fresh, uninitialized resource; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only inputs (full ranges from offset 0)
// and one write-only output.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Wait on the execution timepoint before exposing the output resource to the caller.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it.
module {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it.
module {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it.
module {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it.
module {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it.
module {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) ('builtin.module' operation) //----- //
// Device target alias: an "llvm-cpu" device with one executable target,
// "embedded-elf-x86_64" (x86_64 target triple, native_vector_size = 16, empty cpu_features).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it. Its hal.device.targets
// attribute lists the llvm-cpu device target defined above.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) ('builtin.module' operation) //----- //
// Device target alias: an "llvm-cpu" device with one executable target,
// "embedded-elf-x86_64" (x86_64 target triple, native_vector_size = 16, empty cpu_features).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Identity 2-D indexing map; used for every operand of the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module containing one stream executable (the matmul dispatch) and the host
// entry point @simple_matmul_tensor that invokes it. Its hal.device.targets
// attribute lists the llvm-cpu device target defined above.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch entry point and its workgroup-count export.
stream.executable private @simple_matmul_tensor_dispatch_0 {
// Export: turns the two index workload values supplied at the dispatch site into an (x, y, z) workgroup count.
stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body. Bindings: %arg0 = 32x24 input, %arg1 = 24x16 input, %arg2 = 32x16 input
// (all read-only), %arg3 = 32x16 write-only output. Computes matmul(%arg0, %arg1) + %arg2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// View each binding, at offset %c0, as a typed dispatch tensor.
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
// Load the three inputs in full (zero offsets, unit strides, sizes equal to the tensor shapes).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// Fresh 32x16 tensor (%7) and a copy of it filled with 0.0 (%8) that serves as the matmul accumulator.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
// %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled tensor.
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6), written into the
// unfilled init tensor %7. Block args: %arg4 <- %9, %arg5 <- %6, %arg6 <- %7 (not read).
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
%11 = arith.addf %arg4, %arg5 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
// Store the full 32x16 result into the write-only output binding.
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
// Host entry point: checks the three buffer-view arguments, imports them as
// stream resources, runs the matmul dispatch on them, and exports the 32x16
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// %c0 is the range offset used at the dispatch site; %c3072 / %c1536 / %c2048
// are the resource sizes (equal to 32*24*4, 24*16*4 and 32*16*4 -- presumably
// byte sizes of the f32 tensors; TODO confirm).
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below.
// NOTE(review): presumably the codes for f32 and a dense row-major layout -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: assert shape [32, 24], then import as tensor<32x24xf32> in an external resource of size %c3072.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: assert shape [24, 16], then import as tensor<24x16xf32> (size %c1536).
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: assert shape [32, 16], then import as tensor<32x16xf32> (size %c2048).
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage: a separate, uninitialized external resource of size %c2048.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three imported inputs and the result resource; it yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: the three inputs are bound read-only (ro)
// and the result write-only (wo), each over its full range starting at %c0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
}
} => !stream.timepoint
// Await the execute timepoint before treating %3 as the finished result.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the result resource as a tensor<32x16xf32> buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module below.
// #device_target_llvm_cpu: the "llvm-cpu" device target, listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: 2-D identity indexing map used by the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Device-side executable containing the single dispatch entry point, with one variant for the x86_64 ELF target.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. Its region turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: matmul of bindings 0 and 1 accumulated into a zero-filled tensor, plus binding 2
// element-wise, stored to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are readonly inputs; binding 3 is the writeonly output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Whole-tensor loads (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is used both as the fill destination and as the outs operand of the linalg.generic.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6); the outs block argument %arg2 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%11 = arith.addf %arg0, %arg1 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub): checks the three !hal.buffer_view arguments, imports them
// as external stream resources, issues one dispatch, and exports the result as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Sizes attached to the stream resources below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (presumably byte sizes of the f32 tensors -- TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below
// (NOTE(review): 553648160 is presumably the HAL element-type code for f32 -- confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048; it is the write-only operand of the dispatch.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Captures the three imports and the output allocation; the region yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: three read-only (ro) operands and one write-only (wo) operand.
// The trailing hal.interface.bindings attribute lists four (set, binding) pairs, (0, 0) through (0, 3)
// (presumably one per operand, in operand order -- confirm).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the timepoint, then export %3 as the 32x16xf32 result buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLinalgTransformLegality (iree-llvmcpu-verify-linalg-transform-legality) ('builtin.module' operation) //----- //
// NOTE(review): the aliases and module in this dump appear textually unchanged from the preceding dump in this file.
// Attribute aliases referenced by the module below.
// #device_target_llvm_cpu: the "llvm-cpu" device target, listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: 2-D identity indexing map used by the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Device-side executable containing the single dispatch entry point, with one variant for the x86_64 ELF target.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. Its region turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: matmul of bindings 0 and 1 accumulated into a zero-filled tensor, plus binding 2
// element-wise, stored to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are readonly inputs; binding 3 is the writeonly output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Whole-tensor loads (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is used both as the fill destination and as the outs operand of the linalg.generic.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6); the outs block argument %arg2 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%11 = arith.addf %arg0, %arg1 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub): checks the three !hal.buffer_view arguments, imports them
// as external stream resources, issues one dispatch, and exports the result as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Sizes attached to the stream resources below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (presumably byte sizes of the f32 tensors -- TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below
// (NOTE(review): 553648160 is presumably the HAL element-type code for f32 -- confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048; it is the write-only operand of the dispatch.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Captures the three imports and the output allocation; the region yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: three read-only (ro) operands and one write-only (wo) operand.
// The trailing hal.interface.bindings attribute lists four (set, binding) pairs, (0, 0) through (0, 3)
// (presumably one per operand, in operand order -- confirm).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the timepoint, then export %3 as the 32x16xf32 result buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// NOTE(review): the aliases and module in this dump appear textually unchanged from the preceding dump in this file.
// Attribute aliases referenced by the module below.
// #device_target_llvm_cpu: the "llvm-cpu" device target, listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: 2-D identity indexing map used by the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Device-side executable containing the single dispatch entry point, with one variant for the x86_64 ELF target.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. Its region turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: matmul of bindings 0 and 1 accumulated into a zero-filled tensor, plus binding 2
// element-wise, stored to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are readonly inputs; binding 3 is the writeonly output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Whole-tensor loads (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is used both as the fill destination and as the outs operand of the linalg.generic.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6); the outs block argument %arg2 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%11 = arith.addf %arg0, %arg1 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub): checks the three !hal.buffer_view arguments, imports them
// as external stream resources, issues one dispatch, and exports the result as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Sizes attached to the stream resources below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (presumably byte sizes of the f32 tensors -- TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below
// (NOTE(review): 553648160 is presumably the HAL element-type code for f32 -- confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048; it is the write-only operand of the dispatch.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Captures the three imports and the output allocation; the region yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: three read-only (ro) operands and one write-only (wo) operand.
// The trailing hal.interface.bindings attribute lists four (set, binding) pairs, (0, 0) through (0, 3)
// (presumably one per operand, in operand order -- confirm).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the timepoint, then export %3 as the 32x16xf32 result buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) ('builtin.module' operation) //----- //
// NOTE(review): the aliases and module in this dump appear textually unchanged from the preceding dump in this file.
// Attribute aliases referenced by the module below.
// #device_target_llvm_cpu: the "llvm-cpu" device target, listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map: 2-D identity indexing map used by the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Device-side executable containing the single dispatch entry point, with one variant for the x86_64 ELF target.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. Its region turns the two index workload operands into an (x, y, z) workgroup
// count via flow.dispatch.default_workgroup_count.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch body: matmul of bindings 0 and 1 accumulated into a zero-filled tensor, plus binding 2
// element-wise, stored to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are readonly inputs; binding 3 is the writeonly output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Whole-tensor loads (zero offsets, full sizes, unit strides).
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
// %7 is used both as the fill destination and as the outs operand of the linalg.generic.
%7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
// Element-wise add of the matmul result (%9) and the third input (%6); the outs block argument %arg2 is not read.
%10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%11 = arith.addf %arg0, %arg1 : f32
linalg.yield %11 : f32
} -> tensor<32x16xf32>
flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub): checks the three !hal.buffer_view arguments, imports them
// as external stream resources, issues one dispatch, and exports the result as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Sizes attached to the stream resources below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (presumably byte sizes of the f32 tensors -- TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below
// (NOTE(review): 553648160 is presumably the HAL element-type code for f32 -- confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048; it is the write-only operand of the dispatch.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Captures the three imports and the output allocation; the region yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: three read-only (ro) operands and one write-only (wo) operand.
// The trailing hal.interface.bindings attribute lists four (set, binding) pairs, (0, 0) through (0, 3)
// (presumably one per operand, in operand order -- confirm).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the timepoint, then export %3 as the 32x16xf32 result buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
// Attribute aliases referenced by the module below.
// #config: lowering configuration attached to the linalg.matmul; it carries three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: the "llvm-cpu" device target, listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16 (used for the workgroup count). #map1: s0 * 16 (used for loop offsets and steps).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity indexing map used by the linalg.generic below.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info attached to the export; names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Device-side executable containing the single dispatch entry point, with one variant for the x86_64 ELF target.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export at ordinal 0. The workgroup count is computed explicitly here:
// x = ceildiv(%arg2, 16), y = ceildiv(%arg1, 16), z = 1.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body, tiled into 16x16 output tiles that are distributed over workgroup ids: each tile is
// matmul(16x24 slice of binding 0, 24x16 slice of binding 1) plus the matching 16x16 tile of binding 2,
// stored to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
// Loop upper bounds: 32 for the outer loop (%arg0), 16 for the inner loop (%arg1).
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are readonly inputs; binding 3 is the writeonly output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Outer loop: starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16, up to 32.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
scf.for %arg0 = %4 to %c32 step %5 {
// Inner loop: starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16, up to 16.
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile of binding 2 at [%arg0, %arg1]; it is the second input of the linalg.generic.
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%9 = linalg.init_tensor [16, 16] : tensor<16x16xf32>
// 16x24 slice of binding 0 starting at row %arg0, and 24x16 slice of binding 1 starting at column %arg1.
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Zero-filled 16x16 accumulator for the matmul, which carries the #config lowering configuration.
%12 = linalg.init_tensor [16, 16] : tensor<16x16xf32>
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
%14 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%13 : tensor<16x16xf32>) -> tensor<16x16xf32>
// Element-wise add of the matmul tile (%14) and the loaded tile (%8); the outs block argument %arg4 is not read.
%15 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%14, %8 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%9 : tensor<16x16xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
%16 = arith.addf %arg2, %arg3 : f32
linalg.yield %16 : f32
} -> tensor<16x16xf32>
// Store the 16x16 result tile to binding 3 at [%arg0, %arg1].
flow.dispatch.tensor.store %15, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub): checks the three !hal.buffer_view arguments, imports them
// as external stream resources, issues one dispatch, and exports the result as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Sizes attached to the stream resources below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
// (presumably byte sizes of the f32 tensors -- TODO confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Expected element-type and encoding codes passed to the asserts below
// (NOTE(review): 553648160 is presumably the HAL element-type code for f32 -- confirm).
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding and then imported.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048; it is the write-only operand of the dispatch.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Captures the three imports and the output allocation; the region yields a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [%c32, %c16]: three read-only (ro) operands and one write-only (wo) operand.
// The trailing hal.interface.bindings attribute lists four (set, binding) pairs, (0, 0) through (0, 3)
// (presumably one per operand, in operand order -- confirm).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the timepoint, then export %3 as the 32x16xf32 result buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config: lowering configuration carrying three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: the "llvm-cpu" device target, listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16. #map1: s0 * 16. #map2: 2-D identity map.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single matmul dispatch, as printed after
// ConvertToDestinationPassingStyle. It has one variant, targeting
// #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) through #map0.
// NOTE(review): %arg1/%arg2 are presumably the workload operands [%c32, %c16] passed at the
// stream.cmd.dispatch site, which would give (1, 2, 1) — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Bindings 0 and 1 are the readonly 32x24 and 24x16 matmul operands,
// binding 2 is a readonly 32x16 tensor, and binding 3 is the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Workgroup-distributed loops: each starts at id * 16 and steps by count * 16 (#map1).
// The outer loop covers the first dimension (bound 32) using the y id/count.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
scf.for %arg0 = %4 to %c32 step %5 {
// The inner loop covers the second dimension (bound 16) using the x id/count.
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %6 to %c16 step %7 {
// %8: 16x16 tile read from the result binding (%3); it is the destination of the fill below.
%8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %9: the matching 16x16 tile of binding 2, added to the matmul result further down.
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// %10 / %11: 16x24 and 24x16 slabs of the two matmul operands (the full 24-wide dimension).
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// NOTE: %12 has no uses in this function; the fill below writes into %8 instead.
%12 = linalg.init_tensor [16, 16] : tensor<16x16xf32>
// Fill the output tile with 0.0, then run the matmul with the filled tile as its outs operand.
%13 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
%14 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%13 : tensor<16x16xf32>) -> tensor<16x16xf32>
// Elementwise add: %arg2 is the element from ins (%9), %arg3 the element from outs (%14).
%15 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%14 : tensor<16x16xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%16 = arith.addf %arg3, %arg2 : f32
linalg.yield %16 : f32
} -> tensor<16x16xf32>
// Store the finished tile back to the result binding at the same offsets.
flow.dispatch.tensor.store %15, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: asserts the three incoming buffer views, imports them as
// stream resources, dispatches the matmul executable, and exports the result buffer view.
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its shape, element type code and encoding before import.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte resource for the result; captured below as %arg6 (the `wo` range).
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// One dispatch with workload [%c32, %c16]; the four captured resources line up with
// interface bindings <0, 0> .. <0, 3> listed in hal.interface.bindings.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on the result resource, then export it as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config carries three tile-size lists.
// NOTE(review): presumably one list per tiling level — confirm against the lowering pipeline.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one listed inside #device_target_llvm_cpu; used by the variant op.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16, applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16, applied to workgroup ids/counts to form loop starts and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used as the indexing map of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single matmul dispatch, as printed after
// FoldAffineMinInDistributedLoops. It has one variant, targeting
// #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) through #map0.
// NOTE(review): %arg1/%arg2 are presumably the workload operands [%c32, %c16] passed at the
// stream.cmd.dispatch site, which would give (1, 2, 1) — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Bindings 0 and 1 are the readonly 32x24 and 24x16 matmul operands,
// binding 2 is a readonly 32x16 tensor, and binding 3 is the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Workgroup-distributed loops: each starts at id * 16 and steps by count * 16 (#map1).
// The outer loop covers the first dimension (bound 32) using the y id/count.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
scf.for %arg0 = %4 to %c32 step %5 {
// The inner loop covers the second dimension (bound 16) using the x id/count.
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %6 to %c16 step %7 {
// %8: 16x16 tile read from the result binding (%3); it is the destination of the fill below.
%8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %9: the matching 16x16 tile of binding 2, added to the matmul result further down.
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// %10 / %11: 16x24 and 24x16 slabs of the two matmul operands (the full 24-wide dimension).
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Fill the output tile with 0.0, then run the matmul with the filled tile as its outs operand.
%12 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
%13 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
// Elementwise add: %arg2 is the element from ins (%9), %arg3 the element from outs (%13).
%14 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%13 : tensor<16x16xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%15 = arith.addf %arg3, %arg2 : f32
linalg.yield %15 : f32
} -> tensor<16x16xf32>
// Store the finished tile back to the result binding at the same offsets.
flow.dispatch.tensor.store %14, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: asserts the three incoming buffer views, imports them as
// stream resources, dispatches the matmul executable, and exports the result buffer view.
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its shape, element type code and encoding before import.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte resource for the result; captured below as %arg6 (the `wo` range).
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// One dispatch with workload [%c32, %c16]; the four captured resources line up with
// interface bindings <0, 0> .. <0, 3> listed in hal.interface.bindings.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on the result resource, then export it as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// #config carries three tile-size lists.
// NOTE(review): presumably one list per tiling level — confirm against the lowering pipeline.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one listed inside #device_target_llvm_cpu; used by the variant op.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16, applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16, applied to workgroup ids/counts to form loop starts and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used as the indexing map of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single matmul dispatch, as printed after the Canonicalizer.
// It has one variant, targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) through #map0.
// NOTE(review): %arg1/%arg2 are presumably the workload operands [%c32, %c16] passed at the
// stream.cmd.dispatch site, which would give (1, 2, 1) — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Bindings 0 and 1 are the readonly 32x24 and 24x16 matmul operands,
// binding 2 is a readonly 32x16 tensor, and binding 3 is the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Workgroup-distributed loops: each starts at id * 16 and steps by count * 16 (#map1).
// The outer loop covers the first dimension (bound 32) using the y id/count.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
scf.for %arg0 = %4 to %c32 step %5 {
// The inner loop covers the second dimension (bound 16) using the x id/count.
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %6 to %c16 step %7 {
// %8: 16x16 tile read from the result binding (%3); it is the destination of the fill below.
%8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %9: the matching 16x16 tile of binding 2, added to the matmul result further down.
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// %10 / %11: 16x24 and 24x16 slabs of the two matmul operands (the full 24-wide dimension).
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Fill the output tile with 0.0, then run the matmul with the filled tile as its outs operand.
%12 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
%13 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
// Elementwise add: %arg2 is the element from ins (%9), %arg3 the element from outs (%13).
%14 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%13 : tensor<16x16xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%15 = arith.addf %arg3, %arg2 : f32
linalg.yield %15 : f32
} -> tensor<16x16xf32>
// Store the finished tile back to the result binding at the same offsets.
flow.dispatch.tensor.store %14, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: asserts the three incoming buffer views, imports them as
// stream resources, dispatches the matmul executable, and exports the result buffer view.
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its shape, element type code and encoding before import.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte resource for the result; captured below as %arg6 (the `wo` range).
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// One dispatch with workload [%c32, %c16]; the four captured resources line up with
// interface bindings <0, 0> .. <0, 3> listed in hal.interface.bindings.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on the result resource, then export it as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// #config carries three tile-size lists.
// NOTE(review): presumably one list per tiling level — confirm against the lowering pipeline.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one listed inside #device_target_llvm_cpu; used by the variant op.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16, applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16, applied to workgroup ids/counts to form loop starts and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used as the indexing map of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single matmul dispatch, as printed after module-level CSE.
// It has one variant, targeting #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) through #map0.
// NOTE(review): %arg1/%arg2 are presumably the workload operands [%c32, %c16] passed at the
// stream.cmd.dispatch site, which would give (1, 2, 1) — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Bindings 0 and 1 are the readonly 32x24 and 24x16 matmul operands,
// binding 2 is a readonly 32x16 tensor, and binding 3 is the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Workgroup-distributed loops: each starts at id * 16 and steps by count * 16 (#map1).
// The outer loop covers the first dimension (bound 32) using the y id/count.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
scf.for %arg0 = %4 to %c32 step %5 {
// The inner loop covers the second dimension (bound 16) using the x id/count.
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %6 to %c16 step %7 {
// %8: 16x16 tile read from the result binding (%3); it is the destination of the fill below.
%8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %9: the matching 16x16 tile of binding 2, added to the matmul result further down.
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// %10 / %11: 16x24 and 24x16 slabs of the two matmul operands (the full 24-wide dimension).
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Fill the output tile with 0.0, then run the matmul with the filled tile as its outs operand.
%12 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
%13 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
// Elementwise add: %arg2 is the element from ins (%9), %arg3 the element from outs (%13).
%14 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%13 : tensor<16x16xf32>) {
^bb0(%arg2: f32, %arg3: f32):
%15 = arith.addf %arg3, %arg2 : f32
linalg.yield %15 : f32
} -> tensor<16x16xf32>
// Store the finished tile back to the result binding at the same offsets.
flow.dispatch.tensor.store %14, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: asserts the three incoming buffer views, imports them as
// stream resources, dispatches the matmul executable, and exports the result buffer view.
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its shape, element type code and encoding before import.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte resource for the result; captured below as %arg6 (the `wo` range).
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// One dispatch with workload [%c32, %c16]; the four captured resources line up with
// interface bindings <0, 0> .. <0, 3> listed in hal.interface.bindings.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on the result resource, then export it as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyTileAndFusePass (linalg-strategy-tile-and-fuse-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config carries three tile-size lists.
// NOTE(review): presumably one list per tiling level — confirm against the lowering pipeline.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one listed inside #device_target_llvm_cpu; used by the variant op.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16, applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16, applied to workgroup ids/counts to form loop starts and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: operands of an affine.min, i.e. min(16 - d0, 32); sizes the slices along the second dimension.
#map2 = affine_map<(d0) -> (-d0 + 16, 32)>
// #map3: operands of an affine.min, i.e. min(16 - d0, d1).
#map3 = affine_map<(d0, d1) -> (-d0 + 16, d1)>
// #map4: 2-D identity map used as the indexing map of the linalg.generic.
#map4 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single matmul dispatch, as printed after
// LinalgStrategyTileAndFusePass. The fill, matmul and elementwise add now sit
// inside a two-level scf.for nest over the 16x16 workgroup tile.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) through #map0.
// NOTE(review): %arg1/%arg2 are presumably the workload operands [%c32, %c16] passed at the
// stream.cmd.dispatch site, which would give (1, 2, 1) — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Bindings 0 and 1 are the readonly 32x24 and 24x16 matmul operands,
// binding 2 is a readonly 32x16 tensor, and binding 3 is the writeonly 32x16 result.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Workgroup-distributed loops: each starts at id * 16 and steps by count * 16 (#map1).
// The outer loop covers the first dimension (bound 32) using the y id/count.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
scf.for %arg0 = %4 to %c32 step %5 {
// The inner loop covers the second dimension (bound 16) using the x id/count.
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %6 to %c16 step %7 {
// Workgroup-level tiles, loaded once outside the inner tiling loops:
// %8 = output tile (initial iter_arg), %9 = binding-2 tile, %10/%11 = matmul operand slabs.
%8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Inner tiling: %arg2 steps the first dimension by 8 and %arg4 steps the second by 32,
// matching the [8, 32, 0] list in #config. The output tile is threaded through iter_args.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %8) -> (tensor<16x16xf32>) {
// With bounds 0..16 and step 32 this loop runs exactly once (%arg4 = 0).
%13 = scf.for %arg4 = %c0 to %c16 step %c32 iter_args(%arg5 = %arg3) -> (tensor<16x16xf32>) {
// %14 and %15 are the same expression, min(16 - %arg4, 32); at %arg4 = 0 that is 16.
%14 = affine.min #map2(%arg4)
%15 = affine.min #map2(%arg4)
// %16: 8x? slice of the binding-2 tile; %17: matching slice of the loop-carried output tile.
%16 = tensor.extract_slice %9[%arg2, %arg4] [8, %14] [1, 1] : tensor<16x16xf32> to tensor<8x?xf32>
%17 = tensor.extract_slice %arg5[%arg2, %arg4] [8, %15] [1, 1] : tensor<16x16xf32> to tensor<8x?xf32>
// Fill the output slice with 0.0 before the matmul uses it as its outs operand.
%18 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%17 : tensor<8x?xf32>) -> tensor<8x?xf32>
// %19 = min(16 - %arg4, %15): width of the right-hand operand slice.
%19 = affine.min #map3(%arg4, %15)
%20 = tensor.extract_slice %10[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
%21 = tensor.extract_slice %11[0, %arg4] [24, %19] [1, 1] : tensor<24x16xf32> to tensor<24x?xf32>
// 8x24 times 24x? matmul on the slices; the 24-wide dimension is not tiled here.
%22 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%20, %21 : tensor<8x24xf32>, tensor<24x?xf32>) outs(%18 : tensor<8x?xf32>) -> tensor<8x?xf32>
// Elementwise add: %arg6 is the element from ins (%16), %arg7 the element from outs (%22).
%23 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x?xf32>) outs(%22 : tensor<8x?xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg6: f32, %arg7: f32):
%25 = arith.addf %arg7, %arg6 : f32
linalg.yield %25 : f32
} -> tensor<8x?xf32>
// Write the finished slice back into the loop-carried 16x16 tile and yield it.
%24 = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [8, %15] [1, 1] : tensor<8x?xf32> into tensor<16x16xf32>
scf.yield %24 : tensor<16x16xf32>
}
scf.yield %13 : tensor<16x16xf32>
}
// Store the fully computed 16x16 tile back to the result binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: asserts the three incoming buffer views, imports them as
// stream resources, dispatches the matmul executable, and exports the result buffer view.
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its shape, element type code and encoding before import.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte resource for the result; captured below as %arg6 (the `wo` range).
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// One dispatch with workload [%c32, %c16]; the four captured resources line up with
// interface bindings <0, 0> .. <0, 3> listed in hal.interface.bindings.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on the result resource, then export it as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch function used by @simple_matmul_tensor. It has one
// variant, built for the target named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from the two index operands. With
// #map0 = (s0 ceildiv 16) it returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile owned by this workgroup, computes
// (A_tile * B_tile) + C_tile in 8-row slices and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only f32 inputs (32x24, 24x16, 32x16); binding 3 is the
// write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 / %6 are the starting row / column offsets for this
// workgroup, %5 / %7 the row / column loop steps derived from the workgroup counts.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows (workgroup y dimension).
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 slice of binding 0 for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns (workgroup x dimension).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is read from the write-only binding; it only seeds the iter_arg of the loop
// below, where each 8x16 slice of it is used as the `outs` of linalg.fill.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Processes the 16x16 tile in two 8-row slices (offsets 0 and 8), threading the
// partially updated tile through %arg3. The fill, matmul and generic ops here carry
// the __internal_linalg_transform__ = "1" marker attribute.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the accumulator slice before the matmul.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) * (24x16) accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding-2 slice (%arg4) onto the matmul result (%arg5).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile into the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (attribute iree.abi.stub): asserts and imports the three input
// buffer views, runs the matmul dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (f32 element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is checked for shape / element type / encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that receives the 32x16 f32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]; the three imports are bound read-only (ro)
// and the fresh allocation write-only (wo), in binding order 0..3.
// NOTE(review): the workload values are presumably what the export region receives as its
// index operands -- confirm.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch function used by @simple_matmul_tensor. It has one
// variant, built for the target named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from the two index operands. With
// #map0 = (s0 ceildiv 16) it returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile owned by this workgroup, computes
// (A_tile * B_tile) + C_tile in 8-row slices and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only f32 inputs (32x24, 24x16, 32x16); binding 3 is the
// write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 / %6 are the starting row / column offsets for this
// workgroup, %5 / %7 the row / column loop steps derived from the workgroup counts.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows (workgroup y dimension).
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 slice of binding 0 for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns (workgroup x dimension).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is read from the write-only binding; it only seeds the iter_arg of the loop
// below, where each 8x16 slice of it is used as the `outs` of linalg.fill.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Processes the 16x16 tile in two 8-row slices (offsets 0 and 8), threading the
// partially updated tile through %arg3. The fill, matmul and generic ops here carry
// the __internal_linalg_transform__ = "1" marker attribute.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the accumulator slice before the matmul.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) * (24x16) accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding-2 slice (%arg4) onto the matmul result (%arg5).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile into the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (attribute iree.abi.stub): asserts and imports the three input
// buffer views, runs the matmul dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (f32 element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is checked for shape / element type / encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that receives the 32x16 f32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]; the three imports are bound read-only (ro)
// and the fresh allocation write-only (wo), in binding order 0..3.
// NOTE(review): the workload values are presumably what the export region receives as its
// index operands -- confirm.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch function used by @simple_matmul_tensor. It has one
// variant, built for the target named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from the two index operands. With
// #map0 = (s0 ceildiv 16) it returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile owned by this workgroup, computes
// (A_tile * B_tile) + C_tile in 8-row slices and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only f32 inputs (32x24, 24x16, 32x16); binding 3 is the
// write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 / %6 are the starting row / column offsets for this
// workgroup, %5 / %7 the row / column loop steps derived from the workgroup counts.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows (workgroup y dimension).
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 slice of binding 0 for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns (workgroup x dimension).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is read from the write-only binding; it only seeds the iter_arg of the loop
// below, where each 8x16 slice of it is used as the `outs` of linalg.fill.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Processes the 16x16 tile in two 8-row slices (offsets 0 and 8), threading the
// partially updated tile through %arg3. The linalg ops in this block have no
// __internal_linalg_transform__ marker; only the matmul keeps lowering_config.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the accumulator slice before the matmul.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) * (24x16) accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding-2 slice (%arg4) onto the matmul result (%arg5).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile into the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (attribute iree.abi.stub): asserts and imports the three input
// buffer views, runs the matmul dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (f32 element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is checked for shape / element type / encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that receives the 32x16 f32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]; the three imports are bound read-only (ro)
// and the fresh allocation write-only (wo), in binding order 0..3.
// NOTE(review): the workload values are presumably what the export region receives as its
// index operands -- confirm.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch function used by @simple_matmul_tensor. It has one
// variant, built for the target named by #executable_target_embedded_elf_x86_64_.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: computes the workgroup count from the two index operands. With
// #map0 = (s0 ceildiv 16) it returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile owned by this workgroup, computes
// (A_tile * B_tile) + C_tile in 8-row slices and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only f32 inputs (32x24, 24x16, 32x16); binding 3 is the
// write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 / %6 are the starting row / column offsets for this
// workgroup, %5 / %7 the row / column loop steps derived from the workgroup counts.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows (workgroup y dimension).
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 slice of binding 0 for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns (workgroup x dimension).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is read from the write-only binding; it only seeds the iter_arg of the loop
// below, where each 8x16 slice of it is used as the `outs` of linalg.fill.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Processes the 16x16 tile in two 8-row slices (offsets 0 and 8), threading the
// partially updated tile through %arg3. The linalg ops in this block have no
// __internal_linalg_transform__ marker; only the matmul keeps lowering_config.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the accumulator slice before the matmul.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) * (24x16) accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding-2 slice (%arg4) onto the matmul result (%arg5).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile into the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (attribute iree.abi.stub): asserts and imports the three input
// buffer views, runs the matmul dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (f32 element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding -- confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is checked for shape / element type / encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource that receives the 32x16 f32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]; the three imports are bound read-only (ro)
// and the fresh allocation write-only (wo), in binding order 0..3.
// NOTE(review): the workload values are presumably what the export region receives as its
// index operands -- confirm.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource as a buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul below; it holds three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants and one set (0) with four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: used by the export region to turn its two index operands into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: used to scale workgroup ids and counts into loop offsets and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map shared by the input and the output of the linalg.generic below.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: one HAL executable holding the matmul dispatch, plus the host entry point that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps its two index operands to a 3-D workgroup count.
// NOTE(review): presumably %arg1/%arg2 receive the dispatch workload [%c32, %c16] from the host function -- confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// %0 = ceildiv(%arg1, 16), %1 = ceildiv(%arg2, 16).
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from %arg2 is returned first; the third dimension is fixed at 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile, computes matmul(binding 0, binding 1) plus the binding-2 tile and stores it to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 f32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1) to get this workgroup's starting offsets and the loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at workgroup_id_y * 16 and advances by workgroup_count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the binding-0 operand.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: starts at workgroup_id_x * 16 and advances by workgroup_count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below; each 8x16 slice of it is zero-filled before being used as the matmul output.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand (added to the product below) and the 24x16 column slice of the binding-1 operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in 8-row steps (offsets 0 and 8), carrying the 16x16 tile as %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 destination slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16, accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: matmul result (%arg5) + binding-2 slice (%arg4).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the 8x16 result back into the carried tile at row offset %arg2.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the finished 16x16 tile at [%arg0, %arg1] of the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, launches the dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 is passed as the expected element type and 1 as the expected encoding; presumably these are the HAL codes for f32 and a dense row-major layout -- confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape, type and encoding, then imported as an external stream resource.
// The sizes 3072, 1536 and 2048 equal element count * 4 for the 32x24, 24x16 and 32x16 f32 tensors.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region with a single dispatch, workload [%c32, %c16]; %arg3-%arg5 are bound read-only and %arg6 write-only.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a tensor<32x16xf32> buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul below; it holds three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants and one set (0) with four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: used by the export region to turn its two index operands into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: used to scale workgroup ids and counts into loop offsets and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map shared by the input and the output of the linalg.generic below.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: one HAL executable holding the matmul dispatch, plus the host entry point that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps its two index operands to a 3-D workgroup count.
// NOTE(review): presumably %arg1/%arg2 receive the dispatch workload [%c32, %c16] from the host function -- confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// %0 = ceildiv(%arg1, 16), %1 = ceildiv(%arg2, 16).
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from %arg2 is returned first; the third dimension is fixed at 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile, computes matmul(binding 0, binding 1) plus the binding-2 tile and stores it to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 f32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1) to get this workgroup's starting offsets and the loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at workgroup_id_y * 16 and advances by workgroup_count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the binding-0 operand.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: starts at workgroup_id_x * 16 and advances by workgroup_count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below; each 8x16 slice of it is zero-filled before being used as the matmul output.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand (added to the product below) and the 24x16 column slice of the binding-1 operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in 8-row steps (offsets 0 and 8), carrying the 16x16 tile as %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 destination slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16, accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: matmul result (%arg5) + binding-2 slice (%arg4).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the 8x16 result back into the carried tile at row offset %arg2.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the finished 16x16 tile at [%arg0, %arg1] of the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, launches the dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 is passed as the expected element type and 1 as the expected encoding; presumably these are the HAL codes for f32 and a dense row-major layout -- confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape, type and encoding, then imported as an external stream resource.
// The sizes 3072, 1536 and 2048 equal element count * 4 for the 32x24, 24x16 and 32x16 f32 tensors.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region with a single dispatch, workload [%c32, %c16]; %arg3-%arg5 are bound read-only and %arg6 write-only.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a tensor<32x16xf32> buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul below; it holds three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants and one set (0) with four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: used by the export region to turn its two index operands into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: used to scale workgroup ids and counts into loop offsets and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map shared by the input and the output of the linalg.generic below.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: one HAL executable holding the matmul dispatch, plus the host entry point that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps its two index operands to a 3-D workgroup count.
// NOTE(review): presumably %arg1/%arg2 receive the dispatch workload [%c32, %c16] from the host function -- confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// %0 = ceildiv(%arg1, 16), %1 = ceildiv(%arg2, 16).
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from %arg2 is returned first; the third dimension is fixed at 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile, computes matmul(binding 0, binding 1) plus the binding-2 tile and stores it to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 f32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1) to get this workgroup's starting offsets and the loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at workgroup_id_y * 16 and advances by workgroup_count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the binding-0 operand.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: starts at workgroup_id_x * 16 and advances by workgroup_count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below; each 8x16 slice of it is zero-filled before being used as the matmul output.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand (added to the product below) and the 24x16 column slice of the binding-1 operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in 8-row steps (offsets 0 and 8), carrying the 16x16 tile as %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 destination slice; this fill op carries an __internal_linalg_transform__ = "1" attribute.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16, accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: matmul result (%arg5) + binding-2 slice (%arg4).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the 8x16 result back into the carried tile at row offset %arg2.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the finished 16x16 tile at [%arg0, %arg1] of the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, launches the dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 is passed as the expected element type and 1 as the expected encoding; presumably these are the HAL codes for f32 and a dense row-major layout -- confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape, type and encoding, then imported as an external stream resource.
// The sizes 3072, 1536 and 2048 equal element count * 4 for the 32x24, 24x16 and 32x16 f32 tensors.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region with a single dispatch, workload [%c32, %c16]; %arg3-%arg5 are bound read-only and %arg6 write-only.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a tensor<32x16xf32> buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul below; it holds three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants and one set (0) with four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: used by the export region to turn its two index operands into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: used to scale workgroup ids and counts into loop offsets and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map shared by the input and the output of the linalg.generic below.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: one HAL executable holding the matmul dispatch, plus the host entry point that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps its two index operands to a 3-D workgroup count.
// NOTE(review): presumably %arg1/%arg2 receive the dispatch workload [%c32, %c16] from the host function -- confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// %0 = ceildiv(%arg1, 16), %1 = ceildiv(%arg2, 16).
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from %arg2 is returned first; the third dimension is fixed at 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: for each 16x16 output tile, computes matmul(binding 0, binding 1) plus the binding-2 tile and stores it to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 f32 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1) to get this workgroup's starting offsets and the loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at workgroup_id_y * 16 and advances by workgroup_count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the binding-0 operand.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: starts at workgroup_id_x * 16 and advances by workgroup_count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below; each 8x16 slice of it is zero-filled before being used as the matmul output.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand (added to the product below) and the 24x16 column slice of the binding-1 operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in 8-row steps (offsets 0 and 8), carrying the 16x16 tile as %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 destination slice; this fill op carries an __internal_linalg_transform__ = "1" attribute.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16, accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: matmul result (%arg5) + binding-2 slice (%arg4).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the 8x16 result back into the carried tile at row offset %arg2.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the finished 16x16 tile at [%arg0, %arg1] of the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, launches the dispatch, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 is passed as the expected element type and 1 as the expected encoding; presumably these are the HAL codes for f32 and a dense row-major layout -- confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape, type and encoding, then imported as an external stream resource.
// The sizes 3072, 1536 and 2048 equal element count * 4 for the 32x24, 24x16 and 32x16 f32 tensors.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region with a single dispatch, workload [%c32, %c16]; %arg3-%arg5 are bound read-only and %arg6 write-only.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a tensor<32x16xf32> buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed for this pass dump: one HAL executable
// holding the tiled matmul dispatch, plus the host-side entry function that launches it.
//
// #config is the lowering configuration attached to the linalg.matmul below. It holds
// three tile-size lists; the first, [16, 16, 0], matches workload_per_wg = [16, 16] in
// #translation. NOTE(review): presumably one list per tiling level — confirm against
// the IREE codegen documentation.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// One descriptor set (0) with four storage-buffer bindings (0..3), no push constants.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16 — applied in the export region to each workload operand.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16 — applied to workgroup ids/counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used by both operands of the elementwise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns ceildiv(%arg2, 16), ceildiv(%arg1, 16) and the constant 1.
// NOTE(review): presumably the x/y/z workgroup counts derived from the [%c32, %c16]
// workload passed at the dispatch site — confirm operand order.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: computes binding(3) = binding(0) x binding(1) + binding(2), one 16x16
// output tile per outer-loop iteration, processed in 8-row slices.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings: %0 = 32x24 (readonly), %1 = 24x16 (readonly), %2 = 32x16 (readonly),
// %3 = 32x16 (writeonly result).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop bounds: start at id * 16 and step by count * 16 in each dimension.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop (bound 32), driven by workgroup id/count in dimension 1 (y).
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop (bound 16), driven by workgroup id/count in dimension 0 (x).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly result binding and only serves as the initial
// value of the loop-carried 16x16 tile; each 8x16 slice is zero-filled before use.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Inner loop: two iterations (0 to 16 step 8), each producing one 8x16 slice of the tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// In this dump the fill carries the __internal_linalg_transform__ = "1" marker attribute.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding(2) slice (%17) onto the matmul result (%16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the result binding at the same offsets.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, imports them as stream
// resources, allocates the result, launches the dispatch, and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with f32 byte sizes).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code (f32) and encoding code that
// the asserts check — confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Workload [%c32, %c16]; three read-only inputs and one write-only output, mapped to bindings 0..3.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed for this pass dump: one HAL executable
// holding the tiled matmul dispatch, plus the host-side entry function that launches it.
//
// #config is the lowering configuration attached to the linalg.matmul below. It holds
// three tile-size lists; the first, [16, 16, 0], matches workload_per_wg = [16, 16] in
// #translation. NOTE(review): presumably one list per tiling level — confirm against
// the IREE codegen documentation.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// One descriptor set (0) with four storage-buffer bindings (0..3), no push constants.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16 — applied in the export region to each workload operand.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16 — applied to workgroup ids/counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used by both operands of the elementwise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns ceildiv(%arg2, 16), ceildiv(%arg1, 16) and the constant 1.
// NOTE(review): presumably the x/y/z workgroup counts derived from the [%c32, %c16]
// workload passed at the dispatch site — confirm operand order.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: computes binding(3) = binding(0) x binding(1) + binding(2), one 16x16
// output tile per outer-loop iteration, processed in 8-row slices.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings: %0 = 32x24 (readonly), %1 = 24x16 (readonly), %2 = 32x16 (readonly),
// %3 = 32x16 (writeonly result).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop bounds: start at id * 16 and step by count * 16 in each dimension.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop (bound 32), driven by workgroup id/count in dimension 1 (y).
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop (bound 16), driven by workgroup id/count in dimension 0 (x).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly result binding and only serves as the initial
// value of the loop-carried 16x16 tile; each 8x16 slice is zero-filled before use.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Inner loop: two iterations (0 to 16 step 8), each producing one 8x16 slice of the tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// In this dump the fill has no __internal_linalg_transform__ attribute (the dump
// header names the remove-markers pass).
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding(2) slice (%17) onto the matmul result (%16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the result binding at the same offsets.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, imports them as stream
// resources, allocates the result, launches the dispatch, and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with f32 byte sizes).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code (f32) and encoding code that
// the asserts check — confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Workload [%c32, %c16]; three read-only inputs and one write-only output, mapped to bindings 0..3.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed for this pass dump: one HAL executable
// holding the tiled matmul dispatch, plus the host-side entry function that launches it.
//
// #config is the lowering configuration attached to the linalg.matmul below. It holds
// three tile-size lists; the first, [16, 16, 0], matches workload_per_wg = [16, 16] in
// #translation. NOTE(review): presumably one list per tiling level — confirm against
// the IREE codegen documentation.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// One descriptor set (0) with four storage-buffer bindings (0..3), no push constants.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16 — applied in the export region to each workload operand.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16 — applied to workgroup ids/counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used by both operands of the elementwise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns ceildiv(%arg2, 16), ceildiv(%arg1, 16) and the constant 1.
// NOTE(review): presumably the x/y/z workgroup counts derived from the [%c32, %c16]
// workload passed at the dispatch site — confirm operand order.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: computes binding(3) = binding(0) x binding(1) + binding(2), one 16x16
// output tile per outer-loop iteration, processed in 8-row slices.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings: %0 = 32x24 (readonly), %1 = 24x16 (readonly), %2 = 32x16 (readonly),
// %3 = 32x16 (writeonly result).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop bounds: start at id * 16 and step by count * 16 in each dimension.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop (bound 32), driven by workgroup id/count in dimension 1 (y).
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop (bound 16), driven by workgroup id/count in dimension 0 (x).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly result binding and only serves as the initial
// value of the loop-carried 16x16 tile; each 8x16 slice is zero-filled before use.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Inner loop: two iterations (0 to 16 step 8), each producing one 8x16 slice of the tile.
// Neither the fill nor the matmul carries an __internal_linalg_transform__ marker in this dump.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 accumulated into the zero-filled 8x16 slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding(2) slice (%17) onto the matmul result (%16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the result binding at the same offsets.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, imports them as stream
// resources, allocates the result, launches the dispatch, and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with f32 byte sizes).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code (f32) and encoding code that
// the asserts check — confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Workload [%c32, %c16]; three read-only inputs and one write-only output, mapped to bindings 0..3.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed for this pass dump: one HAL executable
// holding the tiled matmul dispatch, plus the host-side entry function that launches it.
//
// #config is the lowering configuration attached to the linalg.matmul below. It holds
// three tile-size lists; the first, [16, 16, 0], matches workload_per_wg = [16, 16] in
// #translation. NOTE(review): presumably one list per tiling level — confirm against
// the IREE codegen documentation.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// One descriptor set (0) with four storage-buffer bindings (0..3), no push constants.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16 — applied in the export region to each workload operand.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16 — applied to workgroup ids/counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map used by both operands of the elementwise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns ceildiv(%arg2, 16), ceildiv(%arg1, 16) and the constant 1.
// NOTE(review): presumably the x/y/z workgroup counts derived from the [%c32, %c16]
// workload passed at the dispatch site — confirm operand order.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body: computes binding(3) = binding(0) x binding(1) + binding(2), one 16x16
// output tile per outer-loop iteration, processed in 8-row slices.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings: %0 = 32x24 (readonly), %1 = 24x16 (readonly), %2 = 32x16 (readonly),
// %3 = 32x16 (writeonly result).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop bounds: start at id * 16 and step by count * 16 in each dimension.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop (bound 32), driven by workgroup id/count in dimension 1 (y).
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop (bound 16), driven by workgroup id/count in dimension 0 (x).
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly result binding and only serves as the initial
// value of the loop-carried 16x16 tile; each 8x16 slice is zero-filled before use.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Inner loop: two iterations (0 to 16 step 8), each producing one 8x16 slice of the tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 accumulated into the zero-filled 8x16 slice. In this dump the
// matmul carries __internal_linalg_transform__ = "1" next to lowering_config, and
// no tensor.pad op appears anywhere in this function.
%16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the matching binding(2) slice (%17) onto the matmul result (%16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the result binding at the same offsets.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: checks the three input buffer views, imports them as stream
// resources, allocates the result, launches the dispatch, and exports the result.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with f32 byte sizes).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): presumably the HAL element-type code (f32) and encoding code that
// the asserts check — confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result storage is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Workload [%c32, %c16]; three read-only inputs and one write-only output, mapped to bindings 0..3.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the result resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed after the pass named in the dump marker above (CSE).
// It contains one HAL executable holding the tiled matmul-plus-add kernel and the host
// function @simple_matmul_tensor that dispatches it.
//
// #config: lowering configuration attached to the linalg.matmul below; it carries three tile-size lists.
// NOTE(review): presumably one list per tiling level (workgroup / inner parallel / reduction) — confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-ELF x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage-buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the variant below (same fields as the entry inside #device_target_llvm_cpu).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to the export region's index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; scales workgroup ids/counts into loop bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
// NOTE(review): presumably %arg1/%arg2 are the dispatch workload [%c32, %c16] and the results are the x/y/z workgroup counts — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Kernel body: for each 16x16 output tile computes (32x24 operand) x (24x16 operand) + (32x16 operand)
// and stores it to the write-only 32x16 binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop start = workgroup id * 16, loop step = workgroup count * 16 (y drives rows, x drives columns).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 output rows.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the first operand, loaded once per row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only output binding and only seeds iter_args; each 8x16 slice of it
// is overwritten by linalg.fill before being used as the matmul accumulator.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %10: 16x16 tile of the third operand (the addend); %11: 24x16 column tile of the second operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8), each producing an 8x16 slice of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialise the accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) x (24x16) matmul; at this point it carries both the `__internal_linalg_transform__ = "1"` marker and #config.
%16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add: %arg4 is the addend element (from %17), %arg5 the matmul result element (from %16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the loop-carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: validates the three input buffer views, imports them as stream resources,
// allocates the output, dispatches the kernel, waits for it, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with 4-byte f32 elements).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the buffer-view asserts below.
// NOTE(review): presumably the HAL element-type code for f32 and the dense row-major encoding — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte output resource; the dispatch is its only writer.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only (ro) inputs and one write-only (wo) output,
// mapped in order to bindings 0..3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed after the pass named in the dump marker above
// (LinalgStrategyEnablePass). It contains one HAL executable holding the tiled matmul-plus-add
// kernel and the host function @simple_matmul_tensor that dispatches it.
//
// #config: lowering configuration attached to the linalg.matmul below; it carries three tile-size lists.
// NOTE(review): presumably one list per tiling level (workgroup / inner parallel / reduction) — confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-ELF x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage-buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the variant below (same fields as the entry inside #device_target_llvm_cpu).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to the export region's index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; scales workgroup ids/counts into loop bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
// NOTE(review): presumably %arg1/%arg2 are the dispatch workload [%c32, %c16] and the results are the x/y/z workgroup counts — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Kernel body: for each 16x16 output tile computes (32x24 operand) x (24x16 operand) + (32x16 operand)
// and stores it to the write-only 32x16 binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop start = workgroup id * 16, loop step = workgroup count * 16 (y drives rows, x drives columns).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 output rows.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the first operand, loaded once per row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only output binding and only seeds iter_args; each 8x16 slice of it
// is overwritten by linalg.fill before being used as the matmul accumulator.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %10: 16x16 tile of the third operand (the addend); %11: 24x16 column tile of the second operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8), each producing an 8x16 slice of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialise the accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) x (24x16) matmul; at this point it still carries the `__internal_linalg_transform__ = "1"` marker alongside #config.
%16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add: %arg4 is the addend element (from %17), %arg5 the matmul result element (from %16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the loop-carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: validates the three input buffer views, imports them as stream resources,
// allocates the output, dispatches the kernel, waits for it, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with 4-byte f32 elements).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the buffer-view asserts below.
// NOTE(review): presumably the HAL element-type code for f32 and the dense row-major encoding — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte output resource; the dispatch is its only writer.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only (ro) inputs and one write-only (wo) output,
// mapped in order to bindings 0..3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed after the pass named in the dump marker above
// (LinalgStrategyRemoveMarkersPass). It contains one HAL executable holding the tiled
// matmul-plus-add kernel and the host function @simple_matmul_tensor that dispatches it.
// In this snapshot the linalg.matmul carries only `lowering_config`; the
// `__internal_linalg_transform__` attribute seen in the preceding dump is no longer present.
//
// #config: lowering configuration attached to the linalg.matmul below; it carries three tile-size lists.
// NOTE(review): presumably one list per tiling level (workgroup / inner parallel / reduction) — confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-ELF x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage-buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the variant below (same fields as the entry inside #device_target_llvm_cpu).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to the export region's index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; scales workgroup ids/counts into loop bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
// NOTE(review): presumably %arg1/%arg2 are the dispatch workload [%c32, %c16] and the results are the x/y/z workgroup counts — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Kernel body: for each 16x16 output tile computes (32x24 operand) x (24x16 operand) + (32x16 operand)
// and stores it to the write-only 32x16 binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop start = workgroup id * 16, loop step = workgroup count * 16 (y drives rows, x drives columns).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 output rows.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the first operand, loaded once per row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only output binding and only seeds iter_args; each 8x16 slice of it
// is overwritten by linalg.fill before being used as the matmul accumulator.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %10: 16x16 tile of the third operand (the addend); %11: 24x16 column tile of the second operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8), each producing an 8x16 slice of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialise the accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) x (24x16) matmul; only `lowering_config = #config` remains as an attribute here.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add: %arg4 is the addend element (from %17), %arg5 the matmul result element (from %16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the loop-carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: validates the three input buffer views, imports them as stream resources,
// allocates the output, dispatches the kernel, waits for it, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with 4-byte f32 elements).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the buffer-view asserts below.
// NOTE(review): presumably the HAL element-type code for f32 and the dense row-major encoding — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte output resource; the dispatch is its only writer.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only (ro) inputs and one write-only (wo) output,
// mapped in order to bindings 0..3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot of the whole module as printed after the pass named in the dump marker above (LinalgFuse).
// It contains one HAL executable holding the tiled matmul-plus-add kernel and the host
// function @simple_matmul_tensor that dispatches it.
//
// #config: lowering configuration attached to the linalg.matmul below; it carries three tile-size lists.
// NOTE(review): presumably one list per tiling level (workgroup / inner parallel / reduction) — confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with a single embedded-ELF x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage-buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the variant below (same fields as the entry inside #device_target_llvm_cpu).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to the export region's index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; scales workgroup ids/counts into loop bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
// NOTE(review): presumably %arg1/%arg2 are the dispatch workload [%c32, %c16] and the results are the x/y/z workgroup counts — confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Kernel body: for each 16x16 output tile computes (32x24 operand) x (24x16 operand) + (32x16 operand)
// and stores it to the write-only 32x16 binding.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Loop start = workgroup id * 16, loop step = workgroup count * 16 (y drives rows, x drives columns).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 output rows.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of the first operand, loaded once per row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only output binding and only seeds iter_args; each 8x16 slice of it
// is overwritten by linalg.fill before being used as the matmul accumulator.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// %10: 16x16 tile of the third operand (the addend); %11: 24x16 column tile of the second operand.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8), each producing an 8x16 slice of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialise the accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// (8x24) x (24x16) matmul; its only attribute here is `lowering_config = #config`.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add: %arg4 is the addend element (from %17), %arg5 the matmul result element (from %16).
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Write the finished 8x16 slice back into the loop-carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host entry point: validates the three input buffer views, imports them as stream resources,
// allocates the output, dispatches the kernel, waits for it, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (consistent with 4-byte f32 elements).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the buffer-view asserts below.
// NOTE(review): presumably the HAL element-type code for f32 and the dense row-major encoding — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte output resource; the dispatch is its only writer.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: three read-only (ro) inputs and one write-only (wo) output,
// mapped in order to bindings 0..3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before exporting the output resource.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable @simple_matmul_tensor_dispatch_0 as printed after LinalgStrategyPadPass.
// It holds one variant (@embedded_elf_x86_64) with a single export and, in the nested
// builtin.module, the function that implements that export.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps the two index arguments through #map0 (s0 ceildiv 16) and returns
// them together with the constant 1.
// NOTE(review): presumably these three values are the x/y/z workgroup counts and
// %arg1/%arg2 are the dispatch workload operands - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped relative to the arguments: the value derived from %arg2 comes first.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Reads bindings 0, 1 and 2 (32x24, 24x16 and 32x16 f32, read-only) and
// writes binding 3 (32x16 f32, write-only). For every 16x16 output tile it stores
// matmul(binding 0 rows, binding 1 columns) + the matching tile of binding 2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..3 of set 0, all at offset %c0.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is s0 * 16: workgroup id -> loop start, workgroup count -> loop step.
// The y values drive the row loop (%arg0), the x values drive the column loop (%arg1).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop over the 32 output rows; %8 is the 16x24 slice of binding 0 starting at row %arg0.
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9: 16x16 tile of the output binding, used as the initial iter_arg of the loop below.
// %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1 at column %arg1.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8 of the tile), each producing an
// 8x16 slice and threading the 16x16 tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// The destination slice is passed as outs of a fill with %cst (0.0) before the matmul.
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 matmul into the filled slice; carries the #config lowering configuration.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the binding-2 slice (%arg4) onto the matmul result (%arg5).
// In this dump the op still carries the __internal_linalg_transform__ = "1" attribute.
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). Checks and imports the three
// !hal.buffer_view arguments, allocates a result resource, runs the single dispatch
// @simple_matmul_tensor_dispatch_0_matmul_32x16x24 on them, and returns the result
// exported as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below for the 32x24, 24x16 and 32x16 f32 tensors
// (consistent with 4 bytes per element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to every buffer_view assert below.
// NOTE(review): presumably the f32 element-type code and the dense encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported
// as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource as %arg3..%arg6.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with operands [%c32, %c16]: three read-only ranges and one
// write-only range, each covering its whole resource and mapped to bindings <0, 0>..<0, 3>.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execute timepoint before using the output resource %3.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the awaited resource as a 32x16 f32 buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable @simple_matmul_tensor_dispatch_0 as printed after the CSE pass.
// It holds one variant (@embedded_elf_x86_64) with a single export and, in the nested
// builtin.module, the function that implements that export.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps the two index arguments through #map0 (s0 ceildiv 16) and returns
// them together with the constant 1.
// NOTE(review): presumably these three values are the x/y/z workgroup counts and
// %arg1/%arg2 are the dispatch workload operands - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped relative to the arguments: the value derived from %arg2 comes first.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Reads bindings 0, 1 and 2 (32x24, 24x16 and 32x16 f32, read-only) and
// writes binding 3 (32x16 f32, write-only). For every 16x16 output tile it stores
// matmul(binding 0 rows, binding 1 columns) + the matching tile of binding 2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..3 of set 0, all at offset %c0.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is s0 * 16: workgroup id -> loop start, workgroup count -> loop step.
// The y values drive the row loop (%arg0), the x values drive the column loop (%arg1).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop over the 32 output rows; %8 is the 16x24 slice of binding 0 starting at row %arg0.
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9: 16x16 tile of the output binding, used as the initial iter_arg of the loop below.
// %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1 at column %arg1.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8 of the tile), each producing an
// 8x16 slice and threading the 16x16 tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// The destination slice is passed as outs of a fill with %cst (0.0) before the matmul.
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 matmul into the filled slice; carries the #config lowering configuration.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the binding-2 slice (%arg4) onto the matmul result (%arg5).
// In this dump the op still carries the __internal_linalg_transform__ = "1" attribute.
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). Checks and imports the three
// !hal.buffer_view arguments, allocates a result resource, runs the single dispatch
// @simple_matmul_tensor_dispatch_0_matmul_32x16x24 on them, and returns the result
// exported as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below for the 32x24, 24x16 and 32x16 f32 tensors
// (consistent with 4 bytes per element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to every buffer_view assert below.
// NOTE(review): presumably the f32 element-type code and the dense encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported
// as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource as %arg3..%arg6.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with operands [%c32, %c16]: three read-only ranges and one
// write-only range, each covering its whole resource and mapped to bindings <0, 0>..<0, 3>.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execute timepoint before using the output resource %3.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the awaited resource as a 32x16 f32 buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable @simple_matmul_tensor_dispatch_0 as printed after LinalgStrategyEnablePass.
// It holds one variant (@embedded_elf_x86_64) with a single export and, in the nested
// builtin.module, the function that implements that export.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps the two index arguments through #map0 (s0 ceildiv 16) and returns
// them together with the constant 1.
// NOTE(review): presumably these three values are the x/y/z workgroup counts and
// %arg1/%arg2 are the dispatch workload operands - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped relative to the arguments: the value derived from %arg2 comes first.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Reads bindings 0, 1 and 2 (32x24, 24x16 and 32x16 f32, read-only) and
// writes binding 3 (32x16 f32, write-only). For every 16x16 output tile it stores
// matmul(binding 0 rows, binding 1 columns) + the matching tile of binding 2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..3 of set 0, all at offset %c0.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is s0 * 16: workgroup id -> loop start, workgroup count -> loop step.
// The y values drive the row loop (%arg0), the x values drive the column loop (%arg1).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop over the 32 output rows; %8 is the 16x24 slice of binding 0 starting at row %arg0.
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9: 16x16 tile of the output binding, used as the initial iter_arg of the loop below.
// %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1 at column %arg1.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8 of the tile), each producing an
// 8x16 slice and threading the 16x16 tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// The destination slice is passed as outs of a fill with %cst (0.0) before the matmul.
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 matmul into the filled slice; carries the #config lowering configuration.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the binding-2 slice (%arg4) onto the matmul result (%arg5).
// In this dump the op still carries the __internal_linalg_transform__ = "1" attribute.
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). Checks and imports the three
// !hal.buffer_view arguments, allocates a result resource, runs the single dispatch
// @simple_matmul_tensor_dispatch_0_matmul_32x16x24 on them, and returns the result
// exported as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below for the 32x24, 24x16 and 32x16 f32 tensors
// (consistent with 4 bytes per element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to every buffer_view assert below.
// NOTE(review): presumably the f32 element-type code and the dense encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported
// as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource as %arg3..%arg6.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with operands [%c32, %c16]: three read-only ranges and one
// write-only range, each covering its whole resource and mapped to bindings <0, 0>..<0, 3>.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execute timepoint before using the output resource %3.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the awaited resource as a 32x16 f32 buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable @simple_matmul_tensor_dispatch_0 as printed after LinalgStrategyRemoveMarkersPass.
// It holds one variant (@embedded_elf_x86_64) with a single export and, in the nested
// builtin.module, the function that implements that export.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: maps the two index arguments through #map0 (s0 ceildiv 16) and returns
// them together with the constant 1.
// NOTE(review): presumably these three values are the x/y/z workgroup counts and
// %arg1/%arg2 are the dispatch workload operands - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped relative to the arguments: the value derived from %arg2 comes first.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Reads bindings 0, 1 and 2 (32x24, 24x16 and 32x16 f32, read-only) and
// writes binding 3 (32x16 f32, write-only). For every 16x16 output tile it stores
// matmul(binding 0 rows, binding 1 columns) + the matching tile of binding 2.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0..3 of set 0, all at offset %c0.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is s0 * 16: workgroup id -> loop start, workgroup count -> loop step.
// The y values drive the row loop (%arg0), the x values drive the column loop (%arg1).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row loop over the 32 output rows; %8 is the 16x24 slice of binding 0 starting at row %arg0.
scf.for %arg0 = %4 to %c32 step %5 {
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column loop over the 16 output columns.
scf.for %arg1 = %6 to %c16 step %7 {
// %9: 16x16 tile of the output binding, used as the initial iter_arg of the loop below.
// %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1 at column %arg1.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Second tiling level: two iterations (rows 0 and 8 of the tile), each producing an
// 8x16 slice and threading the 16x16 tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// The destination slice is passed as outs of a fill with %cst (0.0) before the matmul.
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 times 24x16 matmul into the filled slice; carries the #config lowering configuration.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Elementwise add of the binding-2 slice (%arg4) onto the matmul result (%arg5).
// In this dump the op has no __internal_linalg_transform__ attribute.
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the carried 16x16 tile.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). Checks and imports the three
// !hal.buffer_view arguments, allocates a result resource, runs the single dispatch
// @simple_matmul_tensor_dispatch_0_matmul_32x16x24 on them, and returns the result
// exported as a !hal.buffer_view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below for the 32x24, 24x16 and 32x16 f32 tensors
// (consistent with 4 bytes per element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to every buffer_view assert below.
// NOTE(review): presumably the f32 element-type code and the dense encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding, then imported
// as an external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size %c2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Execute region capturing the three inputs and the output resource as %arg3..%arg6.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with operands [%c32, %c16]: three read-only ranges and one
// write-only range, each covering its whole resource and mapped to bindings <0, 0>..<0, 3>.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execute timepoint before using the output resource %3.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
// Export the awaited resource as a 32x16 f32 buffer view and return it.
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function. Its region takes a device and two index
// arguments and returns three index values.
// NOTE(review): the returned triple is presumably the workgroup count in x, y, z - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is (s0 ceildiv 16): each index argument is ceil-divided by 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The value derived from %arg2 is returned first, then the one derived from %arg1, then the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as printed after LinalgFuse. For every 16x16 tile of the 32x16 result it walks the
// tile in 8-row slices; each slice is zero-filled, accumulated with an 8x24 by 24x16 matmul, and then
// added element-wise to the matching slice loaded from binding 2. The finished tile is stored to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3 is the write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is (s0 * 16): workgroup id and count are scaled by 16 to give each loop's lower bound and step.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows: starts at id_y * 16 and advances by count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 block of the binding-0 operand for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns: starts at id_x * 16 and advances by count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// Tile read from the write-only result binding; it only seeds the iter_args of the loop below,
// where both 8-row slices are overwritten by linalg.fill before they are used.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand that is added after the matmul.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 block of the binding-1 operand for the current column tile.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walks the 16-row tile in steps of 8 rows, threading the partially updated tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
%15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
// 8x24 by 24x16 matmul accumulating into the zero-filled slice.
%16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
%17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: %arg4 is the element from %17 and %arg5 is the matmul result element.
%18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%20 = arith.addf %arg5, %arg4 : f32
linalg.yield %20 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile carried by the loop.
%19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). It asserts the shape/type/encoding of the three
// incoming buffer views, imports them as external stream resources, allocates a result resource,
// issues one dispatch of the matmul executable, waits on the returned timepoint and exports the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes paired with the 32x24, 24x16 and 32x16 f32 tensors (element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): passed as type(...) and encoding(...) to the asserts below; presumably the HAL codes
// for f32 and the expected encoding - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: asserted as [32, 24], imported as tensor<32x24xf32>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: asserted as [24, 16], imported as tensor<24x16xf32>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: asserted as [32, 16], imported as tensor<32x16xf32>.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-sized external resource that receives the dispatch result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: the three imports are bound read-only (ro) and the
// fresh allocation write-only (wo), mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await timepoint %4 to obtain the result resource %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyTilePass (linalg-strategy-tile-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows.
// #config: lowering configuration holding three tile-size lists
// (presumably one list per tiling level - confirm against the IREE codegen docs).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and one set (index 0) with four storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that is nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceiling division of one symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: one symbol multiplied by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: two-dimensional identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function. Its region takes a device and two index
// arguments and returns three index values.
// NOTE(review): the returned triple is presumably the workgroup count in x, y, z - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is (s0 ceildiv 16): each index argument is ceil-divided by 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The value derived from %arg2 is returned first, then the one derived from %arg1, then the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as printed after LinalgStrategyTilePass. For every 16x16 tile of the 32x16 result it
// walks the tile in 8-row slices; each slice is zero-filled, accumulated by a reduction loop that
// runs 8x12 by 12x16 matmuls over the 24-wide shared dimension, and then added element-wise to the
// matching slice loaded from binding 2. The finished tile is stored to binding 3.
// The fill, matmul and generic ops carry the attribute __internal_linalg_transform__ = "1" in this dump.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3 is the write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is (s0 * 16): workgroup id and count are scaled by 16 to give each loop's lower bound and step.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows: starts at id_y * 16 and advances by count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 block of the binding-0 operand for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns: starts at id_x * 16 and advances by count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// Tile read from the write-only result binding; it only seeds the iter_args of the loop below,
// where both 8-row slices are overwritten by linalg.fill before they are used.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand that is added after the matmul.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 block of the binding-1 operand for the current column tile.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walks the 16-row tile in steps of 8 rows, threading the partially updated tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator slice.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-wide shared dimension in steps of 12 (the step equals the 12 in
// #config's third tile-size list); the accumulator is carried in %arg5, starting from the zero fill.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 by 12x16 partial product accumulated into %arg5.
%21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: %arg4 is the element from %16 and %arg5 is the accumulated matmul element.
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). It asserts the shape/type/encoding of the three
// incoming buffer views, imports them as external stream resources, allocates a result resource,
// issues one dispatch of the matmul executable, waits on the returned timepoint and exports the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes paired with the 32x24, 24x16 and 32x16 f32 tensors (element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): passed as type(...) and encoding(...) to the asserts below; presumably the HAL codes
// for f32 and the expected encoding - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: asserted as [32, 24], imported as tensor<32x24xf32>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: asserted as [24, 16], imported as tensor<24x16xf32>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: asserted as [32, 16], imported as tensor<32x16xf32>.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-sized external resource that receives the dispatch result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: the three imports are bound read-only (ro) and the
// fresh allocation write-only (wo), mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await timepoint %4 to obtain the result resource %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows.
// #config: lowering configuration holding three tile-size lists
// (presumably one list per tiling level - confirm against the IREE codegen docs).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and one set (index 0) with four storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that is nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceiling division of one symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: one symbol multiplied by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: two-dimensional identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function. Its region takes a device and two index
// arguments and returns three index values.
// NOTE(review): the returned triple is presumably the workgroup count in x, y, z - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is (s0 ceildiv 16): each index argument is ceil-divided by 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The value derived from %arg2 is returned first, then the one derived from %arg1, then the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as printed after CSE. For every 16x16 tile of the 32x16 result it walks the tile in
// 8-row slices; each slice is zero-filled, accumulated by a reduction loop that runs 8x12 by 12x16
// matmuls over the 24-wide shared dimension, and then added element-wise to the matching slice
// loaded from binding 2. The finished tile is stored to binding 3.
// The fill, matmul and generic ops carry the attribute __internal_linalg_transform__ = "1" in this dump.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3 is the write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is (s0 * 16): workgroup id and count are scaled by 16 to give each loop's lower bound and step.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows: starts at id_y * 16 and advances by count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 block of the binding-0 operand for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns: starts at id_x * 16 and advances by count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// Tile read from the write-only result binding; it only seeds the iter_args of the loop below,
// where both 8-row slices are overwritten by linalg.fill before they are used.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand that is added after the matmul.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 block of the binding-1 operand for the current column tile.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walks the 16-row tile in steps of 8 rows, threading the partially updated tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator slice.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-wide shared dimension in steps of 12 (the step equals the 12 in
// #config's third tile-size list); the accumulator is carried in %arg5, starting from the zero fill.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 by 12x16 partial product accumulated into %arg5.
%21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: %arg4 is the element from %16 and %arg5 is the accumulated matmul element.
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). It asserts the shape/type/encoding of the three
// incoming buffer views, imports them as external stream resources, allocates a result resource,
// issues one dispatch of the matmul executable, waits on the returned timepoint and exports the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes paired with the 32x24, 24x16 and 32x16 f32 tensors (element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): passed as type(...) and encoding(...) to the asserts below; presumably the HAL codes
// for f32 and the expected encoding - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: asserted as [32, 24], imported as tensor<32x24xf32>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: asserted as [24, 16], imported as tensor<24x16xf32>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: asserted as [32, 16], imported as tensor<32x16xf32>.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-sized external resource that receives the dispatch result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: the three imports are bound read-only (ro) and the
// fresh allocation write-only (wo), mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await timepoint %4 to obtain the result resource %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows.
// #config: lowering configuration holding three tile-size lists
// (presumably one list per tiling level - confirm against the IREE codegen docs).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and one set (index 0) with four storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that is nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceiling division of one symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: one symbol multiplied by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: two-dimensional identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function. Its region takes a device and two index
// arguments and returns three index values.
// NOTE(review): the returned triple is presumably the workgroup count in x, y, z - confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is (s0 ceildiv 16): each index argument is ceil-divided by 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The value derived from %arg2 is returned first, then the one derived from %arg1, then the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as printed after LinalgStrategyEnablePass. For every 16x16 tile of the 32x16 result
// it walks the tile in 8-row slices; each slice is zero-filled, accumulated by a reduction loop that
// runs 8x12 by 12x16 matmuls over the 24-wide shared dimension, and then added element-wise to the
// matching slice loaded from binding 2. The finished tile is stored to binding 3.
// The fill, matmul and generic ops carry the attribute __internal_linalg_transform__ = "1" in this dump.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3 is the write-only 32x16 f32 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is (s0 * 16): workgroup id and count are scaled by 16 to give each loop's lower bound and step.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop over the 32 result rows: starts at id_y * 16 and advances by count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 block of the binding-0 operand for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop over the 16 result columns: starts at id_x * 16 and advances by count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// Tile read from the write-only result binding; it only seeds the iter_args of the loop below,
// where both 8-row slices are overwritten by linalg.fill before they are used.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the binding-2 operand that is added after the matmul.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 block of the binding-1 operand for the current column tile.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walks the 16-row tile in steps of 8 rows, threading the partially updated tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator slice.
%14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-wide shared dimension in steps of 12 (the step equals the 12 in
// #config's third tile-size list); the accumulator is carried in %arg5, starting from the zero fill.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 by 12x16 partial product accumulated into %arg5.
%21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Element-wise add: %arg4 is the element from %16 and %arg5 is the accumulated matmul element.
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) attrs = {__internal_linalg_transform__ = "1"} {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the completed 16x16 tile to the result binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub). It asserts the shape/type/encoding of the three
// incoming buffer views, imports them as external stream resources, allocates a result resource,
// issues one dispatch of the matmul executable, waits on the returned timepoint and exports the
// result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes paired with the 32x24, 24x16 and 32x16 f32 tensors (element count times 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): passed as type(...) and encoding(...) to the asserts below; presumably the HAL codes
// for f32 and the expected encoding - confirm against the HAL headers.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Argument 0: asserted as [32, 24], imported as tensor<32x24xf32>.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
// Argument 1: asserted as [24, 16], imported as tensor<24x16xf32>.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
// Argument 2: asserted as [32, 16], imported as tensor<32x16xf32>.
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-sized external resource that receives the dispatch result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [%c32, %c16]: the three imports are bound read-only (ro) and the
// fresh allocation write-only (wo), mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await timepoint %4 to obtain the result resource %3, then export it as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows.
// #config: lowering configuration holding three tile-size lists
// (presumably one list per tiling level - confirm against the IREE codegen docs).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and one set (index 0) with four storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target attribute that is nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceiling division of one symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: one symbol multiplied by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: two-dimensional identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable containing one variant with one exported dispatch function.
hal.executable private @simple_matmul_tensor_dispatch_0 {
// Variant built for the target aliased as #executable_target_embedded_elf_x86_64_.
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export with ordinal 0. Its region returns three index values computed from %arg1 and %arg2:
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1), using #map0.
// NOTE(review): these look like the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The %arg2-derived value is returned first, the %arg1-derived value second.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Tiled computation of out = lhs * rhs + addend, where lhs is binding 0 (32x24), rhs is binding 1 (24x16),
// addend is binding 2 (32x16) and out is binding 3 (32x16); all element types are f32.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are typed readonly; binding 3 is typed writeonly.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Workgroup id and count along dimension 0 (x) and dimension 1 (y).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Ids and counts multiplied by 16 (#map1) give the loop start offsets and steps below.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Rows: start at workgroup_id_y * 16, stop at 32, step workgroup_count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 tile of lhs at the current row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Columns: start at workgroup_id_x * 16, stop at 16, step workgroup_count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly output binding and seeds the iter_arg of the loop below. Every 8-row band of it
// is overwritten (linalg.fill, then tensor.insert_slice) before %12 is stored, so the loaded values do not reach the result.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the addend and 24x16 tile of rhs for the current offsets.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Two iterations (%arg2 = 0 and 8), each producing one 8x16 row band of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the band so the matmul below accumulates from 0.0.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction over the 24-wide shared dimension in two steps of 12, accumulating into %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Add the matching 8x16 band of the addend tile element-wise to the matmul result.
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
// %arg4 is the element taken from %16, %arg5 the element taken from %15.
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished band back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks the three buffer views, issues one dispatch of the matmul executable,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they equal 32*24*4, 24*16*4 and 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the asserts below; 553648160 is 0x21000020.
// NOTE(review): presumably the HAL codes for f32 elements and a dense row-major encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the 32x16xf32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 (the new allocation) write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows this block.
// Tile sizes carried as `lowering_config` on the linalg.matmul below: three lists of three entries each.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device description listing a single "llvm-cpu" executable target in "embedded-elf-x86_64" format.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage_buffer bindings (0 through 3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Stand-alone executable target; its fields match the entry nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Identity map over two dimensions.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable containing one variant with one exported dispatch function.
hal.executable private @simple_matmul_tensor_dispatch_0 {
// Variant built for the target aliased as #executable_target_embedded_elf_x86_64_.
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export with ordinal 0. Its region returns three index values computed from %arg1 and %arg2:
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1), using #map0.
// NOTE(review): these look like the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The %arg2-derived value is returned first, the %arg1-derived value second.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Tiled computation of out = lhs * rhs + addend, where lhs is binding 0 (32x24), rhs is binding 1 (24x16),
// addend is binding 2 (32x16) and out is binding 3 (32x16); all element types are f32.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are typed readonly; binding 3 is typed writeonly.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Workgroup id and count along dimension 0 (x) and dimension 1 (y).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Ids and counts multiplied by 16 (#map1) give the loop start offsets and steps below.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Rows: start at workgroup_id_y * 16, stop at 32, step workgroup_count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 tile of lhs at the current row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Columns: start at workgroup_id_x * 16, stop at 16, step workgroup_count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly output binding and seeds the iter_arg of the loop below. Every 8-row band of it
// is overwritten (linalg.fill, then tensor.insert_slice) before %12 is stored, so the loaded values do not reach the result.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the addend and 24x16 tile of rhs for the current offsets.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Two iterations (%arg2 = 0 and 8), each producing one 8x16 row band of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the band so the matmul below accumulates from 0.0.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction over the 24-wide shared dimension in two steps of 12, accumulating into %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Add the matching 8x16 band of the addend tile element-wise to the matmul result.
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
// %arg4 is the element taken from %16, %arg5 the element taken from %15.
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished band back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks the three buffer views, issues one dispatch of the matmul executable,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they equal 32*24*4, 24*16*4 and 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the asserts below; 553648160 is 0x21000020.
// NOTE(review): presumably the HAL codes for f32 elements and a dense row-major encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the 32x16xf32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 (the new allocation) write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows this block.
// Tile sizes carried as `lowering_config` on the linalg.matmul below: three lists of three entries each.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device description listing a single "llvm-cpu" executable target in "embedded-elf-x86_64" format.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage_buffer bindings (0 through 3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Stand-alone executable target; its fields match the entry nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Identity map over two dimensions.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable containing one variant with one exported dispatch function.
hal.executable private @simple_matmul_tensor_dispatch_0 {
// Variant built for the target aliased as #executable_target_embedded_elf_x86_64_.
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export with ordinal 0. Its region returns three index values computed from %arg1 and %arg2:
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1), using #map0.
// NOTE(review): these look like the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The %arg2-derived value is returned first, the %arg1-derived value second.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Tiled computation of out = lhs * rhs + addend, where lhs is binding 0 (32x24), rhs is binding 1 (24x16),
// addend is binding 2 (32x16) and out is binding 3 (32x16); all element types are f32.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are typed readonly; binding 3 is typed writeonly.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Workgroup id and count along dimension 0 (x) and dimension 1 (y).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Ids and counts multiplied by 16 (#map1) give the loop start offsets and steps below.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Rows: start at workgroup_id_y * 16, stop at 32, step workgroup_count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 tile of lhs at the current row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Columns: start at workgroup_id_x * 16, stop at 16, step workgroup_count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly output binding and seeds the iter_arg of the loop below. Every 8-row band of it
// is overwritten (linalg.fill, then tensor.insert_slice) before %12 is stored, so the loaded values do not reach the result.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the addend and 24x16 tile of rhs for the current offsets.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Two iterations (%arg2 = 0 and 8), each producing one 8x16 row band of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the band so the matmul below accumulates from 0.0.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction over the 24-wide shared dimension in two steps of 12, accumulating into %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Add the matching 8x16 band of the addend tile element-wise to the matmul result.
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
// %arg4 is the element taken from %16, %arg5 the element taken from %15.
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished band back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks the three buffer views, issues one dispatch of the matmul executable,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they equal 32*24*4, 24*16*4 and 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the asserts below; 553648160 is 0x21000020.
// NOTE(review): presumably the HAL codes for f32 elements and a dense row-major encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the 32x16xf32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 (the new allocation) write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows this block.
// Tile sizes carried as `lowering_config` on the linalg.matmul below: three lists of three entries each.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device description listing a single "llvm-cpu" executable target in "embedded-elf-x86_64" format.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage_buffer bindings (0 through 3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Stand-alone executable target; its fields match the entry nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Identity map over two dimensions.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable containing one variant with one exported dispatch function.
hal.executable private @simple_matmul_tensor_dispatch_0 {
// Variant built for the target aliased as #executable_target_embedded_elf_x86_64_.
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export with ordinal 0. Its region returns three index values computed from %arg1 and %arg2:
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1), using #map0.
// NOTE(review): these look like the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The %arg2-derived value is returned first, the %arg1-derived value second.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Tiled computation of out = lhs * rhs + addend, where lhs is binding 0 (32x24), rhs is binding 1 (24x16),
// addend is binding 2 (32x16) and out is binding 3 (32x16); all element types are f32.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are typed readonly; binding 3 is typed writeonly.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Workgroup id and count along dimension 0 (x) and dimension 1 (y).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Ids and counts multiplied by 16 (#map1) give the loop start offsets and steps below.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Rows: start at workgroup_id_y * 16, stop at 32, step workgroup_count_y * 16.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 tile of lhs at the current row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Columns: start at workgroup_id_x * 16, stop at 16, step workgroup_count_x * 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the writeonly output binding and seeds the iter_arg of the loop below. Every 8-row band of it
// is overwritten (linalg.fill, then tensor.insert_slice) before %12 is stored, so the loaded values do not reach the result.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of the addend and 24x16 tile of rhs for the current offsets.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Two iterations (%arg2 = 0 and 8), each producing one 8x16 row band of the 16x16 tile.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the band so the matmul below accumulates from 0.0.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction over the 24-wide shared dimension in two steps of 12, accumulating into %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Add the matching 8x16 band of the addend tile element-wise to the matmul result.
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
// %arg4 is the element taken from %16, %arg5 the element taken from %15.
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished band back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Write the finished 16x16 tile to the output binding.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks the three buffer views, issues one dispatch of the matmul executable,
// and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; they equal 32*24*4, 24*16*4 and 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Values passed as type(...) and encoding(...) to the asserts below; 553648160 is 0x21000020.
// NOTE(review): presumably the HAL codes for f32 elements and a dense row-major encoding - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the 32x16xf32 result.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Single dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 (the new allocation) write-only (wo).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint before using %3, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows this block.
// Tile sizes carried as `lowering_config` on the linalg.matmul below: three lists of three entries each.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device description listing a single "llvm-cpu" executable target in "embedded-elf-x86_64" format.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; descriptor set 0 with four storage_buffer bindings (0 through 3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Stand-alone executable target; its fields match the entry nested in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Identity map over two dimensions.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function; snapshot after LinalgStrategyPadPass.
// The region takes a device plus two index arguments and yields three index values via hal.return
// (presumably the x/y/z workgroup counts - confirm against the HAL dialect reference).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is ceildiv by 16, so each result is the argument measured in 16-wide tiles.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped on return: the value derived from %arg2 comes first, the one from %arg1 second, then 1.
// NOTE(review): if %arg1/%arg2 are the workload [%c32, %c16] passed by the host dispatch, this yields (1, 2, 1) - confirm.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as it stands after LinalgStrategyPadPass.
// For each 16x16 output tile it computes (binding 0 x binding 1) + binding 2 and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Set 0 bindings: 0 = 32x24 read-only, 1 = 24x16 read-only, 2 = 32x16 read-only, 3 = 32x16 write-only.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 and %6 are the starting row and column for this workgroup,
// %5 and %7 are the loop steps (workgroup count times 16).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: rows %4 up to 32 in steps of %5.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 starting at row %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: columns %6 up to 16 in steps of %7.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below;
// each 8x16 slice of it is zero-filled and then replaced through tensor.insert_slice.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2, added to the product further down.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 column panel of binding 1 starting at column %arg1.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16 rows of the tile in steps of 8, threading the 16x16 tile through as iter_arg %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator before the reduction.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-long contraction dimension in steps of 12.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 times 12x16 partial product accumulated into %arg5. The op carries both the
// __internal_linalg_transform__ marker and the #config lowering config at this point.
%21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Elementwise add of the matching binding-2 slice onto the matmul result
// (%arg4 is the ins element, %arg5 the outs element holding the product).
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile at row %arg2.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub); snapshot printed with the LinalgStrategyPadPass dump.
// Checks three incoming buffer views, dispatches the matmul executable, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding - confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external
// stream resource of the matching byte size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that the dispatch writes the 32x16 result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs and the output resource; it produces a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]: three read-only ranges and one write-only range,
// listed in the same order as bindings 0-3 of set 0 in the hal.interface.bindings attribute.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the timepoint, then export the output resource as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows (IR snapshot after CSE).
// Lowering config referenced by the tiled linalg.matmul: three lists of tile sizes.
// NOTE(review): the loop steps visible in the dispatch function (16 per workgroup, then 8, then 12)
// line up with entries of these lists; the 32 in the second list is wider than the 16-column tile
// and no matching loop is visible - presumably that dimension is simply not tiled further; confirm.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with one embedded-elf-x86_64 executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) holding four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// ceil(s0 / 16): applied in the export region to its two index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to workgroup ids and counts.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map: indexing map for both operands of the linalg.generic in the dispatch function.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function; snapshot after CSE.
// The region takes a device plus two index arguments and yields three index values via hal.return
// (presumably the x/y/z workgroup counts - confirm against the HAL dialect reference).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is ceildiv by 16, so each result is the argument measured in 16-wide tiles.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped on return: the value derived from %arg2 comes first, the one from %arg1 second, then 1.
// NOTE(review): if %arg1/%arg2 are the workload [%c32, %c16] passed by the host dispatch, this yields (1, 2, 1) - confirm.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as it stands after CSE.
// For each 16x16 output tile it computes (binding 0 x binding 1) + binding 2 and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Set 0 bindings: 0 = 32x24 read-only, 1 = 24x16 read-only, 2 = 32x16 read-only, 3 = 32x16 write-only.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 and %6 are the starting row and column for this workgroup,
// %5 and %7 are the loop steps (workgroup count times 16).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: rows %4 up to 32 in steps of %5.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 starting at row %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: columns %6 up to 16 in steps of %7.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below;
// each 8x16 slice of it is zero-filled and then replaced through tensor.insert_slice.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2, added to the product further down.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 column panel of binding 1 starting at column %arg1.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16 rows of the tile in steps of 8, threading the 16x16 tile through as iter_arg %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator before the reduction.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-long contraction dimension in steps of 12.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 times 12x16 partial product accumulated into %arg5. The op carries both the
// __internal_linalg_transform__ marker and the #config lowering config at this point.
%21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Elementwise add of the matching binding-2 slice onto the matmul result
// (%arg4 is the ins element, %arg5 the outs element holding the product).
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile at row %arg2.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub); snapshot printed with the CSE dump.
// Checks three incoming buffer views, dispatches the matmul executable, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding - confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external
// stream resource of the matching byte size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that the dispatch writes the 32x16 result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs and the output resource; it produces a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]: three read-only ranges and one write-only range,
// listed in the same order as bindings 0-3 of set 0 in the hal.interface.bindings attribute.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the timepoint, then export the output resource as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows (IR snapshot after LinalgStrategyEnablePass).
// Lowering config referenced by the tiled linalg.matmul: three lists of tile sizes.
// NOTE(review): the loop steps visible in the dispatch function (16 per workgroup, then 8, then 12)
// line up with entries of these lists; the 32 in the second list is wider than the 16-column tile
// and no matching loop is visible - presumably that dimension is simply not tiled further; confirm.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with one embedded-elf-x86_64 executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) holding four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// ceil(s0 / 16): applied in the export region to its two index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to workgroup ids and counts.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map: indexing map for both operands of the linalg.generic in the dispatch function.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function; snapshot after LinalgStrategyEnablePass.
// The region takes a device plus two index arguments and yields three index values via hal.return
// (presumably the x/y/z workgroup counts - confirm against the HAL dialect reference).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is ceildiv by 16, so each result is the argument measured in 16-wide tiles.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped on return: the value derived from %arg2 comes first, the one from %arg1 second, then 1.
// NOTE(review): if %arg1/%arg2 are the workload [%c32, %c16] passed by the host dispatch, this yields (1, 2, 1) - confirm.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as it stands after LinalgStrategyEnablePass.
// For each 16x16 output tile it computes (binding 0 x binding 1) + binding 2 and stores the tile to binding 3.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Set 0 bindings: 0 = 32x24 read-only, 1 = 24x16 read-only, 2 = 32x16 read-only, 3 = 32x16 write-only.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 and %6 are the starting row and column for this workgroup,
// %5 and %7 are the loop steps (workgroup count times 16).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: rows %4 up to 32 in steps of %5.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 starting at row %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: columns %6 up to 16 in steps of %7.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below;
// each 8x16 slice of it is zero-filled and then replaced through tensor.insert_slice.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2, added to the product further down.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 column panel of binding 1 starting at column %arg1.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16 rows of the tile in steps of 8, threading the 16x16 tile through as iter_arg %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator before the reduction.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-long contraction dimension in steps of 12.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 times 12x16 partial product accumulated into %arg5. The op carries both the
// __internal_linalg_transform__ marker and the #config lowering config at this point.
%21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Elementwise add of the matching binding-2 slice onto the matmul result
// (%arg4 is the ins element, %arg5 the outs element holding the product).
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile at row %arg2.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub); snapshot printed with the LinalgStrategyEnablePass dump.
// Checks three incoming buffer views, dispatches the matmul executable, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding - confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external
// stream resource of the matching byte size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that the dispatch writes the 32x16 result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs and the output resource; it produces a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]: three read-only ranges and one write-only range,
// listed in the same order as bindings 0-3 of set 0 in the hal.interface.bindings attribute.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the timepoint, then export the output resource as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows (IR snapshot after LinalgStrategyRemoveMarkersPass).
// Lowering config still referenced by the tiled linalg.matmul: three lists of tile sizes.
// NOTE(review): the loop steps visible in the dispatch function (16 per workgroup, then 8, then 12)
// line up with entries of these lists; the 32 in the second list is wider than the 16-column tile
// and no matching loop is visible - presumably that dimension is simply not tiled further; confirm.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with one embedded-elf-x86_64 executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) holding four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// ceil(s0 / 16): applied in the export region to its two index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to workgroup ids and counts.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map: indexing map for both operands of the linalg.generic in the dispatch function.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export entry (ordinal 0) for the dispatch function; snapshot after LinalgStrategyRemoveMarkersPass.
// The region takes a device plus two index arguments and yields three index values via hal.return
// (presumably the x/y/z workgroup counts - confirm against the HAL dialect reference).
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is ceildiv by 16, so each result is the argument measured in 16-wide tiles.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// Order is swapped on return: the value derived from %arg2 comes first, the one from %arg1 second, then 1.
// NOTE(review): if %arg1/%arg2 are the workload [%c32, %c16] passed by the host dispatch, this yields (1, 2, 1) - confirm.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body as it stands after LinalgStrategyRemoveMarkersPass.
// For each 16x16 output tile it computes (binding 0 x binding 1) + binding 2 and stores the tile to binding 3.
// The visible difference from the preceding dumps is on the linalg.matmul op (see the note there).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Set 0 bindings: 0 = 32x24 read-only, 1 = 24x16 read-only, 2 = 32x16 read-only, 3 = 32x16 write-only.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 multiplies by 16: %4 and %6 are the starting row and column for this workgroup,
// %5 and %7 are the loop steps (workgroup count times 16).
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: rows %4 up to 32 in steps of %5.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 starting at row %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: columns %6 up to 16 in steps of %7.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only output binding. It only seeds the iter_arg of the loop below;
// each 8x16 slice of it is zero-filled and then replaced through tensor.insert_slice.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2, added to the product further down.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 column panel of binding 1 starting at column %arg1.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16 rows of the tile in steps of 8, threading the 16x16 tile through as iter_arg %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero the 8x16 accumulator before the reduction.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Reduction loop over the 24-long contraction dimension in steps of 12.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
// 8x12 times 12x16 partial product accumulated into %arg5. Only lowering_config remains on the op:
// the __internal_linalg_transform__ attribute it carried in the preceding dumps is gone here.
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Elementwise add of the matching binding-2 slice onto the matmul result
// (%arg4 is the ins element, %arg5 the outs element holding the product).
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile at row %arg2.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point (marked iree.abi.stub); snapshot printed with the LinalgStrategyRemoveMarkersPass dump.
// Checks three incoming buffer views, dispatches the matmul executable, and returns the result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes in bytes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
// encoding 1 the dense row-major encoding - confirm against the HAL definitions.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape, element type and encoding, then import it as an external
// stream resource of the matching byte size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that the dispatch writes the 32x16 result into.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs and the output resource; it produces a timepoint.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]: three read-only ranges and one write-only range,
// listed in the same order as bindings 0-3 of set 0 in the hal.interface.bindings attribute.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the timepoint, then export the output resource as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases used by the module that follows (IR snapshot after LinalgFuse).
// Lowering config: three lists of tile sizes.
// NOTE(review): the op that references #config lies beyond the visible part of this dump - confirm its use site.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: llvm-cpu with one embedded-elf-x86_64 executable target (native_vector_size = 16).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) holding four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target referenced by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// ceil(s0 / 16): applied in the export region to its two index arguments.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to workgroup ids and counts.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map. NOTE(review): its use site is beyond the visible part of this dump.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert, workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable for the matmul dispatch: one variant, one exported entry point, and the
// function that implements it.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: takes the device plus two index operands and returns
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// For each 16x16 output tile: computes binding(0) x binding(1), adds the matching tile of
// binding(2), and stores the result to binding(3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale workgroup ids and counts by 16 (#map1): the ids become loop lower bounds, the counts loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row offset of the output tile: starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16, up to 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 for this row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column offset of the output tile: starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16, up to 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the output binding; it only serves as the destination tensor, since each
// 8-row slice taken from it is zero-filled below before it is accumulated into.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// Matching 16x16 tile of binding 2 (added after the matmul) and 24x16 column panel of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Process the 16x16 tile as 8-row slices (%arg2 = 0, 8); %arg3 carries the tile being built.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialize the 8x16 accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Walk the reduction dimension (24) in chunks of 12; each step accumulates an
// 8x12 by 12x16 product into the accumulator %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Elementwise add: the binding-2 slice (%16, block argument %arg4) is added onto the
// matmul result (%15, block argument %arg5).
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Entry function: checks the three input buffer views, runs the matmul dispatch on them,
// and returns the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; each equals the element count of the corresponding f32 tensor times 4
// (32*24*4, 24*16*4, 32*16*4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL element-type and encoding codes for these f32 tensors - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape/type/encoding, then import it as a tensor held in an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Run the dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 write-only (wo),
// mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on %3, then export the resource as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- //
// Attribute aliases referenced by the module that follows.
// Lowering configuration attached to the linalg.matmul op below; it carries three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target: "llvm-cpu" with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below (same target as listed in the device target above).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16); applied in the export region to its two index operands.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map; used for both operands of the linalg.generic below.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export op: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable for the matmul dispatch: one variant, one exported entry point, and the
// function that implements it.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: takes the device plus two index operands and returns
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// For each 16x16 output tile: computes binding(0) x binding(1), adds the matching tile of
// binding(2), and stores the result to binding(3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale workgroup ids and counts by 16 (#map1): the ids become loop lower bounds, the counts loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row offset of the output tile: starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16, up to 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 for this row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column offset of the output tile: starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16, up to 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the output binding; it only serves as the destination tensor, since each
// 8-row slice taken from it is zero-filled below before it is accumulated into.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// Matching 16x16 tile of binding 2 (added after the matmul) and 24x16 column panel of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Process the 16x16 tile as 8-row slices (%arg2 = 0, 8); %arg3 carries the tile being built.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialize the 8x16 accumulator slice.
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
// Walk the reduction dimension (24) in chunks of 12; each step accumulates an
// 8x12 by 12x16 product into the accumulator %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
%20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
%21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
scf.yield %21 : tensor<8x16xf32>
}
// Elementwise add: the binding-2 slice (%16, block argument %arg4) is added onto the
// matmul result (%15, block argument %arg5).
%16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
%17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%19 = arith.addf %arg5, %arg4 : f32
linalg.yield %19 : f32
} -> tensor<8x16xf32>
// Put the finished 8x16 slice back into the 16x16 tile carried by the loop.
%18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
scf.yield %18 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Entry function: checks the three input buffer views, runs the matmul dispatch on them,
// and returns the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; each equals the element count of the corresponding f32 tensor times 4
// (32*24*4, 24*16*4, 32*16*4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL element-type and encoding codes for these f32 tensors - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape/type/encoding, then import it as a tensor held in an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Run the dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 write-only (wo),
// mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on %3, then export the resource as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyVectorizePass (linalg-strategy-vectorize-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// Device target: "llvm-cpu" with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below (same target as listed in the device target above).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16); applied in the export region to its two index operands.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Indexing maps of the vector.contract below over iteration space (d0, d1, d2):
// left operand (d0, d2), right operand (d2, d1), accumulator/result (d0, d1).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable for the matmul dispatch: one variant, one exported entry point, and the
// function that implements it. In this dump the tile computation is expressed with vector ops.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: takes the device plus two index operands and returns
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// For each 16x16 output tile: computes binding(0) x binding(1), adds the matching tile of
// binding(2), and stores the result to binding(3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// All-zero 8x16 vector written into the accumulator slice before the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar zero passed as the padding operand of every vector.transfer_read below.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale workgroup ids and counts by 16 (#map1): the ids become loop lower bounds, the counts loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row offset of the output tile: starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16, up to 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 for this row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column offset of the output tile: starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16, up to 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the output binding; it only serves as the destination tensor, since each
// 8-row slice taken from it is overwritten with zeros below before it is read.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// Matching 16x16 tile of binding 2 (added after the contraction) and 24x16 column panel of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Process the 16x16 tile as 8-row slices (%arg2 = 0, 8); %arg3 carries the tile being built.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
%13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
// Zero-initialize the 8x16 accumulator slice by writing the zero vector over all of it.
%14 = vector.transfer_write %cst, %13[%c0, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<8x16xf32>
// Walk the reduction dimension (24) in chunks of 12. Each step reads the 8x12 and 12x16 operand
// vectors plus the current 8x16 accumulator, contracts over d2 with an add combiner, and writes
// the result back into the accumulator tensor %arg5.
%15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
%20 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%21 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
%22 = vector.transfer_read %arg5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32>
%23 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %20, %21, %22 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
%24 = vector.transfer_write %23, %arg5[%c0, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<8x16xf32>
scf.yield %24 : tensor<8x16xf32>
}
// Add the binding-2 rows (%16) to the contraction result (%17) and write the 8x16 sum
// into rows [%arg2, %arg2 + 8) of the tile carried by the loop.
%16 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%17 = vector.transfer_read %15[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32>
%18 = arith.addf %17, %16 : vector<8x16xf32>
%19 = vector.transfer_write %18, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %19 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Entry function: checks the three input buffer views, runs the matmul dispatch on them,
// and returns the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; each equals the element count of the corresponding f32 tensor times 4
// (32*24*4, 24*16*4, 32*16*4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL element-type and encoding codes for these f32 tensors - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape/type/encoding, then import it as a tensor held in an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Run the dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 write-only (wo),
// mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on %3, then export the resource as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// Device target: "llvm-cpu" with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below (same target as listed in the device target above).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16); applied in the export region to its two index operands.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Indexing maps of the vector.contract below over iteration space (d0, d1, d2):
// left operand (d0, d2), right operand (d2, d1), accumulator/result (d0, d1).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable for the matmul dispatch: one variant, one exported entry point, and the
// function that implements it. In this dump the accumulator is carried through the
// reduction loop as a vector value rather than as a tensor.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: takes the device plus two index operands and returns
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload - confirm.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// For each 16x16 output tile: computes binding(0) x binding(1), adds the matching tile of
// binding(2), and stores the result to binding(3).
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// All-zero 8x16 vector: the initial accumulator of the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar zero passed as the padding operand of every vector.transfer_read below.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale workgroup ids and counts by 16 (#map1): the ids become loop lower bounds, the counts loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row offset of the output tile: starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16, up to 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row panel of binding 0 for this row offset.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column offset of the output tile: starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16, up to 16.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the output binding; it only serves as the destination tensor: the loop
// below overwrites rows [0, 8) and [8, 16) across all 16 columns with transfer_write.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// Matching 16x16 tile of binding 2 (added after the contraction) and 24x16 column panel of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Process the 16x16 tile as 8-row slices (%arg2 = 0, 8); %arg3 carries the tile being built.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// Walk the reduction dimension (24) in chunks of 12. The 8x16 accumulator %arg5 is a vector
// iter_arg that starts at zero; each step contracts an 8x12 by 12x16 pair into it.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the binding-2 rows (%14) to the contraction result (%13) and write the 8x16 sum
// into rows [%arg2, %arg2 + 8) of the tile carried by the loop.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the completed 16x16 tile to the output binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Entry function: checks the three input buffer views, runs the matmul dispatch on them,
// and returns the 32x16 result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below; each equals the element count of the corresponding f32 tensor times 4
// (32*24*4, 24*16*4, 32*16*4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Operands of the type(...) and encoding(...) clauses of the asserts below.
// NOTE(review): presumably the HAL element-type and encoding codes for these f32 tensors - confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// For each argument: assert shape/type/encoding, then import it as a tensor held in an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized external resource of size 2048 that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Run the dispatch with workload [32, 16]: %arg3-%arg5 are bound read-only (ro), %arg6 write-only (wo),
// mapped to bindings 0-3 of set 0.
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execution timepoint on %3, then export the resource as a 32x16 f32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// Device target: "llvm-cpu" with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the variant below (same target as listed in the device target above).
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 -> ceil(s0 / 16); applied in the export region to its two index operands.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 -> s0 * 16; applied to workgroup ids and counts.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// Three projections of a 3-D iteration space (d0, d1, d2): (d0, d2), (d2, d1) and (d0, d1).
// NOTE(review): presumably the indexing maps of a vector.contract further down in this module - confirm.
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: contains the HAL executable for the single matmul dispatch and the
// host-side entry function @simple_matmul_tensor that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) computed from its two index arguments.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload — confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. For each 16x16 output tile it accumulates (32x24 input) x (24x16 input) in
// 8x16 vector pieces, adds the matching piece of the 32x16 read-only input, and stores the tile
// to the write-only 32x16 output.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero 8x16 vector: initial accumulator of the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar zero: padding value operand of the vector.transfer_read ops.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0..2 of set 0 are read-only (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1): ids become loop lower bounds, counts become loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: %arg0 starts at id_y * 16 and advances by count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of binding 0 at row offset %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: %arg1 starts at id_x * 16 and advances by count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only binding and only seeds the iter_arg below; the two
// 8x16 transfer_writes (rows 0..7 and 8..15) overwrite every element before the store.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2 (added to the product) and 24x16 column tile of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in steps of 8 rows, threading the result tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// Reduction over the 24-wide dimension in chunks of 12, accumulating into an 8x16 vector that starts at zero.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
// (8x12) x (12x16) contraction added onto the running accumulator %arg5; d2 is the reduction dimension.
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the matching 8x16 slice of the binding-2 tile to the product, then write it into rows [%arg2, %arg2 + 8) of the result tile.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the finished 16x16 tile to the write-only binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks and imports three buffer views, dispatches the executable above
// with a separate uninitialized result resource, and returns that result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes; they equal 32*24*4, 24*16*4 and 32*16*4 respectively (presumably byte sizes of the f32 tensors — confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer_view asserts; their meaning is not visible in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result resource is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]; three read-only resources and one write-only resource mapped to bindings (0,0)..(0,3).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execute timepoint on the result resource, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases shared by the module that follows in this dump.
// Device target: "llvm-cpu" with one executable target ("embedded-elf-x86_64"), marked legacy_sync.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) with four storage_buffer bindings numbered 0..3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one nested in #device_target_llvm_cpu; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to both index arguments in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract, in operand order (lhs, rhs, accumulator):
// lhs is indexed (d0, d2), rhs (d2, d1), accumulator (d0, d1).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: contains the HAL executable for the single matmul dispatch and the
// host-side entry function @simple_matmul_tensor that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) computed from its two index arguments.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload — confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. For each 16x16 output tile it accumulates (32x24 input) x (24x16 input) in
// 8x16 vector pieces, adds the matching piece of the 32x16 read-only input, and stores the tile
// to the write-only 32x16 output.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero 8x16 vector: initial accumulator of the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar zero: padding value operand of the vector.transfer_read ops.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0..2 of set 0 are read-only (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1): ids become loop lower bounds, counts become loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: %arg0 starts at id_y * 16 and advances by count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of binding 0 at row offset %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: %arg1 starts at id_x * 16 and advances by count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only binding and only seeds the iter_arg below; the two
// 8x16 transfer_writes (rows 0..7 and 8..15) overwrite every element before the store.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2 (added to the product) and 24x16 column tile of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in steps of 8 rows, threading the result tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// Reduction over the 24-wide dimension in chunks of 12, accumulating into an 8x16 vector that starts at zero.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
// (8x12) x (12x16) contraction added onto the running accumulator %arg5; d2 is the reduction dimension.
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the matching 8x16 slice of the binding-2 tile to the product, then write it into rows [%arg2, %arg2 + 8) of the result tile.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the finished 16x16 tile to the write-only binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks and imports three buffer views, dispatches the executable above
// with a separate uninitialized result resource, and returns that result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes; they equal 32*24*4, 24*16*4 and 32*16*4 respectively (presumably byte sizes of the f32 tensors — confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer_view asserts; their meaning is not visible in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result resource is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]; three read-only resources and one write-only resource mapped to bindings (0,0)..(0,3).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execute timepoint on the result resource, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases shared by the module that follows in this dump.
// Device target: "llvm-cpu" with one executable target ("embedded-elf-x86_64"), marked legacy_sync.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) with four storage_buffer bindings numbered 0..3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one nested in #device_target_llvm_cpu; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to both index arguments in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract, in operand order (lhs, rhs, accumulator):
// lhs is indexed (d0, d2), rhs (d2, d1), accumulator (d0, d1).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: contains the HAL executable for the single matmul dispatch and the
// host-side entry function @simple_matmul_tensor that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) computed from its two index arguments.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload — confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. For each 16x16 output tile it accumulates (32x24 input) x (24x16 input) in
// 8x16 vector pieces, adds the matching piece of the 32x16 read-only input, and stores the tile
// to the write-only 32x16 output.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero 8x16 vector: initial accumulator of the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar zero: padding value operand of the vector.transfer_read ops.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0..2 of set 0 are read-only (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1): ids become loop lower bounds, counts become loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: %arg0 starts at id_y * 16 and advances by count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of binding 0 at row offset %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: %arg1 starts at id_x * 16 and advances by count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only binding and only seeds the iter_arg below; the two
// 8x16 transfer_writes (rows 0..7 and 8..15) overwrite every element before the store.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2 (added to the product) and 24x16 column tile of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in steps of 8 rows, threading the result tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// Reduction over the 24-wide dimension in chunks of 12, accumulating into an 8x16 vector that starts at zero.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
// (8x12) x (12x16) contraction added onto the running accumulator %arg5; d2 is the reduction dimension.
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the matching 8x16 slice of the binding-2 tile to the product, then write it into rows [%arg2, %arg2 + 8) of the result tile.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the finished 16x16 tile to the write-only binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks and imports three buffer views, dispatches the executable above
// with a separate uninitialized result resource, and returns that result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes; they equal 32*24*4, 24*16*4 and 32*16*4 respectively (presumably byte sizes of the f32 tensors — confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer_view asserts; their meaning is not visible in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result resource is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]; three read-only resources and one write-only resource mapped to bindings (0,0)..(0,3).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execute timepoint on the result resource, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases shared by the module that follows in this dump.
// Device target: "llvm-cpu" with one executable target ("embedded-elf-x86_64"), marked legacy_sync.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) with four storage_buffer bindings numbered 0..3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one nested in #device_target_llvm_cpu; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to both index arguments in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract, in operand order (lhs, rhs, accumulator):
// lhs is indexed (d0, d2), rhs (d2, d1), accumulator (d0, d1).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module: contains the HAL executable for the single matmul dispatch and the
// host-side entry function @simple_matmul_tensor that launches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) computed from its two index arguments.
// NOTE(review): presumably these are the x/y/z workgroup counts derived from the dispatch workload — confirm against the HAL dialect docs.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. For each 16x16 output tile it accumulates (32x24 input) x (24x16 input) in
// 8x16 vector pieces, adds the matching piece of the 32x16 read-only input, and stores the tile
// to the write-only 32x16 output.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero 8x16 vector: initial accumulator of the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar zero: padding value operand of the vector.transfer_read ops.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0..2 of set 0 are read-only (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// Scale ids and counts by 16 (#map1): ids become loop lower bounds, counts become loop steps.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: %arg0 starts at id_y * 16 and advances by count_y * 16, bounded by 32.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 row tile of binding 0 at row offset %arg0.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: %arg1 starts at id_x * 16 and advances by count_x * 16, bounded by 16.
scf.for %arg1 = %6 to %c16 step %7 {
// %9 is loaded from the write-only binding and only seeds the iter_arg below; the two
// 8x16 transfer_writes (rows 0..7 and 8..15) overwrite every element before the store.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// 16x16 tile of binding 2 (added to the product) and 24x16 column tile of binding 1.
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the 16-row tile in steps of 8 rows, threading the result tile through %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// Reduction over the 24-wide dimension in chunks of 12, accumulating into an 8x16 vector that starts at zero.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
// (8x12) x (12x16) contraction added onto the running accumulator %arg5; d2 is the reduction dimension.
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the matching 8x16 slice of the binding-2 tile to the product, then write it into rows [%arg2, %arg2 + 8) of the result tile.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the finished 16x16 tile to the write-only binding at [%arg0, %arg1].
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks and imports three buffer views, dispatches the executable above
// with a separate uninitialized result resource, and returns that result as a buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes; they equal 32*24*4, 24*16*4 and 32*16*4 respectively (presumably byte sizes of the f32 tensors — confirm).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer_view asserts; their meaning is not visible in this dump.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each argument is asserted against its expected shape/type/encoding before being imported as an external stream resource.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Result resource is allocated uninitialized; the dispatch binds it write-only.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Dispatch with workload [%c32, %c16]; three read-only resources and one write-only resource mapped to bindings (0,0)..(0,3).
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Await the execute timepoint on the result resource, then export it as a 32x16xf32 buffer view.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases shared by the module that follows in this dump.
// Device target: "llvm-cpu" with one executable target ("embedded-elf-x86_64"), marked legacy_sync.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; one set (0) with four storage_buffer bindings numbered 0..3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one nested in #device_target_llvm_cpu; referenced by the hal.executable.variant.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 -> ceil(s0 / 16); applied to both index arguments in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 -> s0 * 16; applied to workgroup ids and counts to form loop lower bounds and steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: affine maps over (d0, d1, d2) yielding (d0, d2), (d2, d1) and (d0, d1).
// NOTE(review): presumably the lhs/rhs/accumulator indexing maps of a vector.contract, as in the preceding dumps — confirm in the module body.
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Workgroup-count region for the exported entry point: takes the device plus two workload
// indices and returns three workgroup counts.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is `s0 ceildiv 16`, so each workload index is split into chunks of 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from the second workload index (%arg2) is returned first, then the one
// derived from %arg1; the third count is the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body for the 32x16x24 matmul (tensor form). Output is produced in 16x16 tiles
// distributed over workgroups; inside a tile, 8x16 vector slices are accumulated over the
// K dimension (24) in two chunks of 12.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero-initialised 8x16 accumulator used as the start value of the K-reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Padding value operand for the vector.transfer_read ops below.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0 and 1: read-only 32x24 and 24x16 operands. Binding 2: read-only 32x16 tensor
// that is added to the product. Binding 3: write-only 32x16 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is `s0 * 16`: scale workgroup ids/counts into element offsets/steps.
// The y values drive the loop over 32 rows, the x values the loop over 16 columns.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at this workgroup's row offset and steps by 16 * workgroup_count_y.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 slice of binding 0 (all of K) for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: starts at this workgroup's column offset and steps by 16 * workgroup_count_x.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only result binding; it only serves as the initial
// destination tensor threaded through the loop below.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// Matching 16x16 tile of binding 2 (the addend).
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 slice of binding 1 for the current column tile.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the tile in two 8-row steps, carrying the destination tensor in %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// K reduction: two steps of 12, accumulating into an 8x16 vector that starts at zero.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
// 8x12 times 12x16 contraction added onto the running accumulator %arg5.
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the 8x16 slice of the binding-2 tile to the accumulated product and write the sum
// into rows [%arg2, %arg2 + 8) of the destination tile.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the finished 16x16 tile into the result binding at the tile's offsets.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks and imports the three input buffer views, allocates the
// result resource, records the matmul dispatch, waits for it, and exports the result as a
// buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 for the 32x24xf32 input, 1536 for the 24x16xf32 input,
// 2048 for the 32x16xf32 input and result (each equals element count * 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts.
// NOTE(review): 553648160 is presumably the HAL element-type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs (%arg3..%arg5) and the output (%arg6).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]; the inputs are bound read-only (ro) and the
// output write-only (wo), mapped to bindings 0..3 of set 0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before the output resource is exported and returned.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After LinalgInitTensorToAllocTensor (linalg-init-tensor-to-alloc-tensor) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Workgroup-count region for the exported entry point: takes the device plus two workload
// indices and returns three workgroup counts.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is `s0 ceildiv 16`, so each workload index is split into chunks of 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from the second workload index (%arg2) is returned first, then the one
// derived from %arg1; the third count is the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body for the 32x16x24 matmul (tensor form). Output is produced in 16x16 tiles
// distributed over workgroups; inside a tile, 8x16 vector slices are accumulated over the
// K dimension (24) in two chunks of 12.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero-initialised 8x16 accumulator used as the start value of the K-reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Padding value operand for the vector.transfer_read ops below.
%cst_0 = arith.constant 0.000000e+00 : f32
// Bindings 0 and 1: read-only 32x24 and 24x16 operands. Binding 2: read-only 32x16 tensor
// that is added to the product. Binding 3: write-only 32x16 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is `s0 * 16`: scale workgroup ids/counts into element offsets/steps.
// The y values drive the loop over 32 rows, the x values the loop over 16 columns.
%4 = affine.apply #map1()[%workgroup_id_y]
%5 = affine.apply #map1()[%workgroup_count_y]
%6 = affine.apply #map1()[%workgroup_id_x]
%7 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at this workgroup's row offset and steps by 16 * workgroup_count_y.
scf.for %arg0 = %4 to %c32 step %5 {
// 16x24 slice of binding 0 (all of K) for the current row tile.
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
// Column-tile loop: starts at this workgroup's column offset and steps by 16 * workgroup_count_x.
scf.for %arg1 = %6 to %c16 step %7 {
// 16x16 tile loaded from the write-only result binding; it only serves as the initial
// destination tensor threaded through the loop below.
%9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
// Matching 16x16 tile of binding 2 (the addend).
%10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
// 24x16 slice of binding 1 for the current column tile.
%11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
// Walk the tile in two 8-row steps, carrying the destination tensor in %arg3.
%12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
// K reduction: two steps of 12, accumulating into an 8x16 vector that starts at zero.
%13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
%18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
// 8x12 times 12x16 contraction added onto the running accumulator %arg5.
%19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %19 : vector<8x16xf32>
}
// Add the 8x16 slice of the binding-2 tile to the accumulated product and write the sum
// into rows [%arg2, %arg2 + 8) of the destination tile.
%14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
%15 = arith.addf %13, %14 : vector<8x16xf32>
%16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
scf.yield %16 : tensor<16x16xf32>
}
// Store the finished 16x16 tile into the result binding at the tile's offsets.
flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
}
}
return
}
}
}
}
// Host-side entry point: checks and imports the three input buffer views, allocates the
// result resource, records the matmul dispatch, waits for it, and exports the result as a
// buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 for the 32x24xf32 input, 1536 for the 24x16xf32 input,
// 2048 for the 32x16xf32 input and result (each equals element count * 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts.
// NOTE(review): 553648160 is presumably the HAL element-type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs (%arg3..%arg5) and the output (%arg6).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]; the inputs are bound read-only (ro) and the
// output write-only (wo), mapped to bindings 0..3 of set 0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before the output resource is exported and returned.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1)[s0] -> (d0 * 24 + s0 + d1)>
#map3 = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map7 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Workgroup-count region for the exported entry point: takes the device plus two workload
// indices and returns three workgroup counts.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is `s0 ceildiv 16`, so each workload index is split into chunks of 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from the second workload index (%arg2) is returned first, then the one
// derived from %arg1; the third count is the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body for the 32x16x24 matmul right after bufferization: the computation now
// reads and writes memrefs, while tensor-typed subspans and bufferization.to_tensor ops
// are still present but have no users in this function.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero-initialised 8x16 accumulator used as the start value of the K-reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Padding value operand for the vector.transfer_read ops below.
%cst_0 = arith.constant 0.000000e+00 : f32
// Each binding appears twice: a memref view (used below, with a 64-byte alignment
// assumption) and the earlier !flow.dispatch.tensor view (%1, %3, %5, %7), which is not
// referenced again in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<32x24xf32>
memref.assume_alignment %0, 64 : memref<32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<24x16xf32>
memref.assume_alignment %2, 64 : memref<24x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
memref.assume_alignment %4, 64 : memref<32x16xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
memref.assume_alignment %6, 64 : memref<32x16xf32>
%7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is `s0 * 16`: scale workgroup ids/counts into element offsets/steps.
// The y values drive the loop over 32 rows, the x values the loop over 16 columns.
%8 = affine.apply #map1()[%workgroup_id_y]
%9 = affine.apply #map1()[%workgroup_count_y]
%10 = affine.apply #map1()[%workgroup_id_x]
%11 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at this workgroup's row offset and steps by 16 * workgroup_count_y.
scf.for %arg0 = %8 to %c32 step %9 {
// 16x24 view into binding 0 for the current row tile (#map2: row stride 24 plus a
// symbolic offset).
%12 = memref.subview %0[%arg0, 0] [16, 24] [1, 1] : memref<32x24xf32> to memref<16x24xf32, #map2>
// Unused tensor view of %12.
%13 = bufferization.to_tensor %12 : memref<16x24xf32, #map2>
// Column-tile loop: starts at this workgroup's column offset and steps by 16 * workgroup_count_x.
scf.for %arg1 = %10 to %c16 step %11 {
// 16x16 view into the result buffer (binding 3) for this tile; written in place below.
%14 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
%15 = bufferization.to_tensor %14 : memref<16x16xf32, #map3>
// 16x16 view into binding 2 (the addend) for this tile.
%16 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
%17 = bufferization.to_tensor %16 : memref<16x16xf32, #map3>
%18 = bufferization.to_tensor %2 : memref<24x16xf32>
// Walk the tile in two 8-row steps. The loop-carried value is the destination memref
// itself: %arg3 starts as %14 and is yielded unchanged.
%19 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %14) -> (memref<16x16xf32, #map3>) {
%22 = bufferization.to_tensor %arg3 : memref<16x16xf32, #map3>
// K reduction: two steps of 12, accumulating into an 8x16 vector that starts at zero.
%23 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%27 = vector.transfer_read %12[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : memref<16x24xf32, #map2>, vector<8x12xf32>
// Reads directly from the full binding-1 memref rather than a per-tile subview.
%28 = vector.transfer_read %2[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : memref<24x16xf32>, vector<12x16xf32>
// 8x12 times 12x16 contraction added onto the running accumulator %arg5.
%29 = vector.contract {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %28, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %29 : vector<8x16xf32>
}
// Add the 8x16 slice of the binding-2 tile to the accumulated product and write the sum
// into rows [%arg2, %arg2 + 8) of the destination view.
%24 = vector.transfer_read %16[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, #map3>, vector<8x16xf32>
%25 = arith.addf %23, %24 : vector<8x16xf32>
vector.transfer_write %25, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, memref<16x16xf32, #map3>
%26 = bufferization.to_tensor %arg3 : memref<16x16xf32, #map3>
scf.yield %arg3 : memref<16x16xf32, #map3>
}
%20 = bufferization.to_tensor %19 : memref<16x16xf32, #map3>
// Element-wise copy standing in for the former tensor store. %19 is the %14 view carried
// through the loop and %21 is a subview of %6 at the same offsets, so source and
// destination cover the same region of the result buffer.
%21 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel"]} ins(%19 : memref<16x16xf32, #map3>) outs(%21 : memref<16x16xf32, #map3>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
}
}
}
return
}
}
}
}
// Host-side entry point: checks and imports the three input buffer views, allocates the
// result resource, records the matmul dispatch, waits for it, and exports the result as a
// buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 for the 32x24xf32 input, 1536 for the 24x16xf32 input,
// 2048 for the 32x16xf32 input and result (each equals element count * 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts.
// NOTE(review): 553648160 is presumably the HAL element-type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs (%arg3..%arg5) and the output (%arg6).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]; the inputs are bound read-only (ro) and the
// output write-only (wo), mapped to bindings 0..3 of set 0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before the output resource is exported and returned.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1)[s0] -> (d0 * 24 + s0 + d1)>
#map3 = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map7 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Workgroup-count region for the exported entry point: takes the device plus two workload
// indices and returns three workgroup counts.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
// #map0 is `s0 ceildiv 16`, so each workload index is split into chunks of 16.
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
// The count derived from the second workload index (%arg2) is returned first, then the one
// derived from %arg1; the third count is the constant 1.
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body for the 32x16x24 matmul in memref form. The bufferization.to_tensor ops
// are gone at this point; the tensor-typed binding subspans remain but are unused here.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero-initialised 8x16 accumulator used as the start value of the K-reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Padding value operand for the vector.transfer_read ops below.
%cst_0 = arith.constant 0.000000e+00 : f32
// Each binding appears twice: a memref view (used below, with a 64-byte alignment
// assumption) and a !flow.dispatch.tensor view (%1, %3, %5, %7), which is not referenced
// again in this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<32x24xf32>
memref.assume_alignment %0, 64 : memref<32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<24x16xf32>
memref.assume_alignment %2, 64 : memref<24x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
memref.assume_alignment %4, 64 : memref<32x16xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
memref.assume_alignment %6, 64 : memref<32x16xf32>
%7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
// #map1 is `s0 * 16`: scale workgroup ids/counts into element offsets/steps.
// The y values drive the loop over 32 rows, the x values the loop over 16 columns.
%8 = affine.apply #map1()[%workgroup_id_y]
%9 = affine.apply #map1()[%workgroup_count_y]
%10 = affine.apply #map1()[%workgroup_id_x]
%11 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: starts at this workgroup's row offset and steps by 16 * workgroup_count_y.
scf.for %arg0 = %8 to %c32 step %9 {
// 16x24 view into binding 0 for the current row tile (#map2: row stride 24 plus a
// symbolic offset).
%12 = memref.subview %0[%arg0, 0] [16, 24] [1, 1] : memref<32x24xf32> to memref<16x24xf32, #map2>
// Column-tile loop: starts at this workgroup's column offset and steps by 16 * workgroup_count_x.
scf.for %arg1 = %10 to %c16 step %11 {
// 16x16 views for this tile: %13 into the result buffer (binding 3), %14 into the addend
// (binding 2).
%13 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
%14 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
// Walk the tile in two 8-row steps. The loop-carried value is the destination memref
// itself: %arg3 starts as %13 and is yielded unchanged.
%15 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %13) -> (memref<16x16xf32, #map3>) {
// K reduction: two steps of 12, accumulating into an 8x16 vector that starts at zero.
%17 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
%20 = vector.transfer_read %12[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : memref<16x24xf32, #map2>, vector<8x12xf32>
// Reads directly from the full binding-1 memref rather than a per-tile subview.
%21 = vector.transfer_read %2[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : memref<24x16xf32>, vector<12x16xf32>
// 8x12 times 12x16 contraction added onto the running accumulator %arg5.
%22 = vector.contract {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %20, %21, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %22 : vector<8x16xf32>
}
// Add the 8x16 slice of the binding-2 tile to the accumulated product and write the sum
// into rows [%arg2, %arg2 + 8) of the destination view.
%18 = vector.transfer_read %14[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, #map3>, vector<8x16xf32>
%19 = arith.addf %17, %18 : vector<8x16xf32>
vector.transfer_write %19, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, memref<16x16xf32, #map3>
scf.yield %arg3 : memref<16x16xf32, #map3>
}
// Element-wise copy standing in for the former tensor store. %15 is the %13 view carried
// through the loop and %16 is a subview of %6 at the same offsets, so source and
// destination cover the same region of the result buffer.
%16 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel"]} ins(%15 : memref<16x16xf32, #map3>) outs(%16 : memref<16x16xf32, #map3>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
}
}
}
return
}
}
}
}
// Host-side entry point: checks and imports the three input buffer views, allocates the
// result resource, records the matmul dispatch, waits for it, and exports the result as a
// buffer view.
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
// Resource sizes used below: 3072 for the 32x24xf32 input, 1536 for the 24x16xf32 input,
// 2048 for the 32x16xf32 input and result (each equals element count * 4).
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Type and encoding codes passed to the buffer-view asserts.
// NOTE(review): 553648160 is presumably the HAL element-type code for f32 — confirm.
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
// Each input is asserted against its expected shape/type/encoding, then imported as an
// external stream resource of the matching size.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
// Uninitialized 2048-byte external resource that receives the dispatch output.
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
// Command region capturing the three inputs (%arg3..%arg5) and the output (%arg6).
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
// Single dispatch with workload [32, 16]; the inputs are bound read-only (ro) and the
// output write-only (wo), mapped to bindings 0..3 of set 0.
stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
// Wait on the execution timepoint before the output resource is exported and returned.
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
%6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1)[s0] -> (d0 * 24 + s0 + d1)>
#map3 = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map7 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Executable holding the single dispatch for the 32x16x24 matmul. It has one
// variant (embedded ELF, x86_64) containing one exported entry point and the
// builtin.module with that entry point's body.
hal.executable private @simple_matmul_tensor_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
// Export region: takes a device and two index arguments and returns
// (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1). The divisor 16 matches
// workload_per_wg = [16, 16] in #translation.
// NOTE(review): the returned triple is presumably the workgroup count in
// (x, y, z) order -- confirm against the HAL dialect documentation.
hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
%1 = affine.apply #map0()[%arg2]
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
// Dispatch body. Takes no arguments; all buffers come from
// hal.interface.binding.subspan on set 0, bindings 0..3:
//   binding 0 -> memref<32x24xf32>  (%0, left-hand operand of the contraction)
//   binding 1 -> memref<24x16xf32>  (%2, right-hand operand)
//   binding 2 -> memref<32x16xf32>  (%4, read and added to the product)
//   binding 3 -> memref<32x16xf32>  (%6, written)
// The work is distributed in 16x16 tiles over workgroup ids y (rows) and
// x (columns), then processed as 8x16 vector tiles with the reduction
// dimension of 24 split into two steps of 12.
func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
// Zero vector used as the initial accumulator of the reduction loop.
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%c24 = arith.constant 24 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
// Scalar 0.0 passed as the padding operand of every vector.transfer_read.
%cst_0 = arith.constant 0.000000e+00 : f32
// Each binding is materialized twice: once as a memref (used below) and once
// as a !flow.dispatch.tensor. The tensor-typed values %1, %3, %5 and %7 have
// no uses inside this function.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<32x24xf32>
memref.assume_alignment %0, 64 : memref<32x24xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<24x16xf32>
memref.assume_alignment %2, 64 : memref<24x16xf32>
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
memref.assume_alignment %4, 64 : memref<32x16xf32>
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
%6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
memref.assume_alignment %6, 64 : memref<32x16xf32>
%7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
// Workgroup ids/counts along dimensions 0 (x) and 1 (y), each scaled by 16
// (#map1) to obtain the loop start offsets and strides.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%8 = affine.apply #map1()[%workgroup_id_y]
%9 = affine.apply #map1()[%workgroup_count_y]
%10 = affine.apply #map1()[%workgroup_id_x]
%11 = affine.apply #map1()[%workgroup_count_x]
// Row-tile loop: %arg0 runs from id_y*16 up to 32 in steps of count_y*16.
scf.for %arg0 = %8 to %c32 step %9 {
// 16x24 slab of the binding-0 buffer starting at row %arg0.
%12 = memref.subview %0[%arg0, 0] [16, 24] [1, 1] : memref<32x24xf32> to memref<16x24xf32, #map2>
// Column-tile loop: %arg1 runs from id_x*16 up to 16 in steps of count_x*16.
scf.for %arg1 = %10 to %c16 step %11 {
// 16x16 tiles at (%arg0, %arg1): %13 in the binding-3 buffer (destination),
// %14 in the binding-2 buffer (values added to the product).
%13 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
%14 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
// Within the tile, process 8 rows at a time.
scf.for %arg2 = %c0 to %c16 step %c8 {
// Reduction loop over the 24-wide dimension in steps of 12; the running
// 8x16 sum is carried in %arg4 and starts from the zero vector %cst.
%16 = scf.for %arg3 = %c0 to %c24 step %c12 iter_args(%arg4 = %cst) -> (vector<8x16xf32>) {
%19 = vector.transfer_read %12[%arg2, %arg3], %cst_0 {in_bounds = [true, true]} : memref<16x24xf32, #map2>, vector<8x12xf32>
// The binding-1 buffer is read directly (no subview) at column offset %c0,
// not %arg1.
%20 = vector.transfer_read %2[%arg3, %c0], %cst_0 {in_bounds = [true, true]} : memref<24x16xf32>, vector<12x16xf32>
// (8x12) * (12x16) contraction accumulated into %arg4 with kind add.
%21 = vector.contract {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %19, %20, %arg4 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
scf.yield %21 : vector<8x16xf32>
}
// Add the matching 8x16 slice of the binding-2 tile to the product and
// store the sum into the binding-3 tile.
%17 = vector.transfer_read %14[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, #map3>, vector<8x16xf32>
%18 = arith.addf %16, %17 : vector<8x16xf32>
vector.transfer_write %18, %13[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, memref<16x16xf32, #map3>
}
// Element-wise copy of %13 into %15 (the body just yields the input element).
// NOTE(review): %15 is built from the same buffer, offsets, sizes and strides
// as %13, so this appears to copy the tile onto itself -- confirm whether a
// later pass is expected to remove it.
%15 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel"]} ins(%13 : memref<16x16xf32, #map3>) outs(%15 : memref<16x16xf32, #map3>) {
^bb0(%arg2: f32, %arg3: f32):
linalg.yield %arg2 : f32
}
}
}
return
}
}
}
}
func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c3072 = arith.constant 3072 : index
%c1536 = arith.constant 1536 : index
%c2048 = arith.constant 2048 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c24 = arith.constant 24 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment