vmurali/passes.mlir

## passes.mlir
// -----// IR Dump After IREEImportPublic (iree-import-public) ('builtin.module' operation) //----- //
// Starting IR in this dump: one function that takes three statically shaped
// f32 tensors (32x24, 24x16, 32x16) and returns the result of a single
// linalg.matmul. %arg0 and %arg1 are the `ins` operands; %arg2 is the `outs`
// operand.
module {
  func.func @simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    return %0 : tensor<32x16xf32>
  }
}


// -----// IR Dump After ImportMLProgram (iree-import-ml-program) ('builtin.module' operation) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    return %0 : tensor<32x16xf32>
  }
}


// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) ('builtin.module' operation) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    return %0 : tensor<32x16xf32>
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) ('builtin.module' operation) //----- //
// Compared with the preceding dump, the module now holds two functions:
//  * @simple_matmul_tensor keeps the public name but takes and returns
//    !hal.buffer_view values and carries the `iree.abi.stub` attribute. It
//    imports each argument as a tensor, calls the private function, and
//    exports the result.
//  * @_simple_matmul_tensor (private) holds the original tensor-typed body.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Each buffer view is imported with the static tensor type of the
    // corresponding original argument.
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = call @_simple_matmul_tensor(%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
  func.func private @_simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    return %0 : tensor<32x16xf32>
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @_simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = call @_simple_matmul_tensor(%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
  func.func private @_simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    return %0 : tensor<32x16xf32>
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = call @_simple_matmul_tensor(%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
  func.func private @_simple_matmul_tensor(%arg0: tensor<32x24xf32>, %arg1: tensor<24x16xf32>, %arg2: tensor<32x16xf32>) -> tensor<32x16xf32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%arg2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    return %0 : tensor<32x16xf32>
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Compared with the preceding dump, the call to @_simple_matmul_tensor has
// been replaced by the linalg.matmul itself, operating directly on the
// imported tensors, and the private function is now a body-less declaration.
// NOTE(review): the banner names the canonicalizer, yet the visible change is
// an inlining of the callee — presumably this dump was taken while the
// inliner pipeline was running; confirm against the pass pipeline.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
  func.func private @_simple_matmul_tensor(tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
}


// -----// IR Dump After Inliner (inline) ('builtin.module' operation) //----- //
// Compared with the preceding dump, the private @_simple_matmul_tensor
// declaration is gone; only the public buffer_view entry point remains, with
// the matmul inlined in its body.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass (iree-util-demote-f64-to-f32) ('builtin.module' operation) //----- //
// Identical to the preceding dump: every tensor is already f32, and no
// visible change was made at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After ConvertConv2D1x1ConvToMatmul (iree-flow-convert-conv2d-1x1-to-matmul) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: the function contains no convolution op,
// and no visible change was made at this step.
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%2 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After DetachElementwiseFromNamedOps (iree-flow-detach-elementwise-from-named-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// Compared with the preceding dump, the matmul no longer uses the imported
// third argument (%2) as its `outs` operand. Instead:
//  * a fresh 32x16 tensor is created and filled with 0.0 (%3, %4);
//  * the matmul writes into that zero-filled tensor (%5);
//  * a new elementwise linalg.generic adds %5 and %2 with arith.addf (%6).
// #map is the 2-D identity map used for all three operands of the generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    // Both iterators are parallel. The block reads only %arg3 and %arg4 (the
    // two `ins` elements); the `outs` element %arg5 is not used in the body.
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) ('builtin.module' operation) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After ExpandTensorShapes (iree-flow-expand-tensor-shapes) ('builtin.module' operation) //----- //
// Identical to the preceding dump: all tensor shapes are already static, and
// no visible change was made at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Compared with the preceding dump, the only difference is on the module op:
// it now carries the attribute `iree.fixedpoint.iteration = 0 : index`.
// The function body is unchanged.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identical to the preceding dump: the module declares no globals, and no
// visible change was made at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass (iree-util-fixed-point-iterator) ('builtin.module' operation) //----- //
// Compared with the preceding dump, the module op no longer carries the
// `iree.fixedpoint.iteration` attribute. The function body is unchanged.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After PadTensorToTensorInsertSlice (iree-flow-pad-tensor-to-tensor-insert-slice) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: the function contains no pad op, and no
// visible change was made at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgFoldUnitExtentDims (linalg-fold-unit-extent-dims) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: none of the tensor types has a dimension
// of size 1, and no visible change was made at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: the linalg.generic keeps the same indexing
// maps and iterator order, and no visible change was made at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: the function contains no dim queries, and
// no visible change was made at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identical to the preceding dump: no visible change to the IR at this step.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%4 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after FusionOfTensorOps.
  // Difference from the preceding dump (after CSE): a second
  // linalg.init_tensor (%6) appears, and the linalg.generic now names it as
  // its outs operand instead of the zero-filled %4. Later SSA values are
  // renumbered by one as a result (%7 generic, %8 export, %9 addf).
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    // New in this dump: same shape and type as %3, used only as the generic's outs.
    %6 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    // The body still reads only %arg3 and %arg4; the outs block argument
    // %arg5 is unused.
    %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%6 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %9 = arith.addf %arg3, %arg4 : f32
      linalg.yield %9 : f32
    } -> tensor<32x16xf32>
    %8 = hal.tensor.export %7 : tensor<32x16xf32> -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after the Canonicalizer pass.
  // The module text is identical to the preceding dump (after
  // FusionOfTensorOps); both linalg.init_tensor ops (%3 and %6) are still
  // present.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    // Same op, shape and type as %3 above.
    %6 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%6 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %9 = arith.addf %arg3, %arg4 : f32
      linalg.yield %9 : f32
    } -> tensor<32x16xf32>
    %8 = hal.tensor.export %7 : tensor<32x16xf32> -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after the CSE pass.
  // Difference from the preceding dump (after Canonicalizer): the second
  // linalg.init_tensor is gone, and the linalg.generic now names %3 as its
  // outs operand. %3 is the same init_tensor that feeds linalg.fill.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    // %3 now has two uses: the outs of linalg.fill and the outs of linalg.generic.
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    // Elementwise add of %5 and %2; the outs block argument %arg5 is not read.
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after SplitReduction.
  // The module text is identical to the preceding dump (after CSE); the
  // linalg.matmul is still a single op over the full 32x24 and 24x16 operands.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after InterchangeGenericOps.
  // The module text is identical to the preceding dump (after SplitReduction);
  // the generic keeps the same identity indexing maps and iterator types.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
      %8 = arith.addf %arg3, %arg4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x16xf32>
    %7 = hal.tensor.export %6 : tensor<32x16xf32> -> !hal.buffer_view
    return %7 : !hal.buffer_view
  }
}


// -----// IR Dump After DispatchLinalgOnTensors (iree-flow-dispatch-linalg-on-tensors-pass) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after DispatchLinalgOnTensors.
  // Difference from the preceding dump (after InterchangeGenericOps): the
  // init_tensor / fill / matmul / generic sequence now sits inside a single
  // flow.dispatch.workgroups region, and the %cst constant is defined inside
  // that region. The outer function only imports, dispatches and exports.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Workload operands of the dispatch; the values match the 32x16 result shape.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    // The three imported tensors become readonly !flow.dispatch.tensor block
    // arguments (%arg3..%arg5); the result is the writeonly argument %arg6.
    %3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
        (%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
      %cst = arith.constant 0.000000e+00 : f32
      // Each input is loaded in full: zero offsets, unit strides, sizes equal to its shape.
      %5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
      %6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
      %7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
      %8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
      %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
      // Elementwise add of the matmul result %10 and the loaded third input %7.
      %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
      ^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
        %12 = arith.addf %arg7, %arg8 : f32
        linalg.yield %12 : f32
      } -> tensor<32x16xf32>
      // The full 32x16 result is written to the writeonly dispatch tensor.
      flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
      flow.return
    } count(%arg3: index, %arg4: index) -> (index, index, index) {
      // Count region: takes the two index arguments and returns three index
      // values produced by flow.dispatch.default_workgroup_count.
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
      flow.return %x, %y, %z : index, index, index
    }
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after CaptureDispatchDynamicDims.
  // The module text is identical to the preceding dump (after
  // DispatchLinalgOnTensors). Every tensor type in this function is
  // statically shaped, and the dispatch operand list is unchanged.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
        (%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
      %cst = arith.constant 0.000000e+00 : f32
      %5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
      %6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
      %7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
      %8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
      %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
      ^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
        %12 = arith.addf %arg7, %arg8 : f32
        linalg.yield %12 : f32
      } -> tensor<32x16xf32>
      flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
      flow.return
    } count(%arg3: index, %arg4: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
      flow.return %x, %y, %z : index, index, index
    }
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after the Canonicalizer pass.
  // The module text is identical to the preceding dump (after
  // CaptureDispatchDynamicDims).
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
        (%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
      %cst = arith.constant 0.000000e+00 : f32
      %5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
      %6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
      %7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
      // %8 is used as the outs of both linalg.fill and linalg.generic.
      %8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
      %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
      ^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
        %12 = arith.addf %arg7, %arg8 : f32
        linalg.yield %12 : f32
      } -> tensor<32x16xf32>
      flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
      flow.return
    } count(%arg3: index, %arg4: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
      flow.return %x, %y, %z : index, index, index
    }
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of @simple_matmul_tensor as printed after the CSE pass.
  // The module text is identical to the preceding dump (after Canonicalizer).
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
        (%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
      %cst = arith.constant 0.000000e+00 : f32
      %5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
      %6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
      %7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
      %8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
      %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
      // The add region reads only %arg7 and %arg8; the outs block argument
      // %arg9 is unused.
      %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
      ^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
        %12 = arith.addf %arg7, %arg8 : f32
        linalg.yield %12 : f32
      } -> tensor<32x16xf32>
      flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
      flow.return
    } count(%arg3: index, %arg4: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
      flow.return %x, %y, %z : index, index, index
    }
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of the module as printed after InitializeEmptyTensors.
  // The module text is identical to the preceding dump (after CSE). The only
  // linalg.init_tensor in the function (%8) is inside the dispatch region and
  // is left as is.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch.workgroups[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32> =
        (%arg3: !flow.dispatch.tensor<readonly:32x24xf32>, %arg4: !flow.dispatch.tensor<readonly:24x16xf32>, %arg5: !flow.dispatch.tensor<readonly:32x16xf32>, %arg6: !flow.dispatch.tensor<writeonly:32x16xf32>) {
      %cst = arith.constant 0.000000e+00 : f32
      %5 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
      %6 = flow.dispatch.tensor.load %arg4, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
      %7 = flow.dispatch.tensor.load %arg5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
      %8 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
      %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %10 = linalg.matmul ins(%5, %6 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%9 : tensor<32x16xf32>) -> tensor<32x16xf32>
      %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%10, %7 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%8 : tensor<32x16xf32>) {
      ^bb0(%arg7: f32, %arg8: f32, %arg9: f32):
        %12 = arith.addf %arg7, %arg8 : f32
        linalg.yield %12 : f32
      } -> tensor<32x16xf32>
      flow.dispatch.tensor.store %11, %arg6, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
      flow.return
    } count(%arg3: index, %arg4: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg3, %arg4
      flow.return %x, %y, %z : index, index, index
    }
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of the module as printed after OutlineDispatchRegions.
  // Difference from the preceding dump (after InitializeEmptyTensors): the
  // body of the flow.dispatch.workgroups region now lives in its own
  // flow.executable, and the caller references it by symbol via flow.dispatch.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // The export's workgroups region has the same ops as the former count
    // region: default_workgroup_count on two index arguments, returning three
    // index values.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Outlined dispatch body: three readonly dispatch tensors in
      // (%arg0..%arg2), one writeonly dispatch tensor out (%arg3), no return value.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result %5 and the loaded third input %2.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: imports the three buffer views, dispatches the
  // outlined executable with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::StripDebugOpsPass (iree-util-strip-debug-ops) ('flow.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of the module as printed after StripDebugOps ran on the
  // flow.executable. The module text is identical to the preceding dump
  // (after OutlineDispatchRegions).
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    // Symbol reference: @<executable>::@<export>, with workload [%c32, %c16].
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of the module as printed after the Canonicalizer pass ran on
  // func.func @simple_matmul_tensor. The module text is identical to the
  // preceding dump (after StripDebugOps).
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Both index constants below are still used as the dispatch workload operands.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Snapshot of the module as printed after DeduplicateExecutables.
  // The module text is identical to the preceding dump (after Canonicalizer).
  // The module contains a single flow.executable.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // The full 32x16 result is written to the writeonly dispatch tensor %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('flow.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('flow.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After VerifyInput (iree-stream-verify-input) ('builtin.module' operation) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After OutlineConstants (iree-stream-outline-constants) ('builtin.module' operation) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// NOTE(review): this snapshot appears textually identical to the preceding dump in this log.
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Private executable containing one exported dispatch function.
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: the workgroups region forwards its two index arguments to
    // flow.dispatch.default_workgroup_count and returns that op's three results.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function: %arg0..%arg2 are readonly dispatch tensors, %arg3 is the writeonly result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Load the full extent of each input (zero offsets, unit strides).
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // 32x16 init tensor; %4 is that tensor filled with 0.0 and serves as the matmul accumulator.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %5 = %0 x %1, accumulated into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The out block argument %arg6 is not read in the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into %arg3.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry function (iree.abi.stub): imports the three buffer views as tensors,
  // invokes the dispatch with workload [%c32, %c16], and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Snapshot printed after iree-util-apply-patterns: one flow executable that
// holds the fused matmul + elementwise-add dispatch, and the host entry point
// that invokes it.
// #map is the 2-D identity map shared by every operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Export region: returns the three values produced by
    // flow.dispatch.default_workgroup_count from the two workload arguments.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0..%arg2 are read-only inputs, %arg3 is the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        // Full-tensor loads (zero offsets, unit strides) of all three inputs.
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %3 is used twice: as the fill destination and as the outs of the generic below.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // The matmul accumulates into a zero-filled 32x16 tensor.
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%5) and the third input (%2).
        // The outs element (%arg6) is not read by the body.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Whole-tensor store of the sum into the write-only output.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: imports the three buffer views as tensors, runs the
  // dispatch with workload [32, 16], and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Snapshot printed after iree-util-fold-globals. The module contains no
// util.global ops: only the flow executable and the host entry point.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: forwards the result of
    // flow.dispatch.default_workgroup_count on the two workload values.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Computes matmul(%arg0, %arg1) + %arg2 and stores it to %arg3.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        // Zero used to initialise the matmul accumulator.
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // 32x24 by 24x16 matmul accumulating into the zero-filled %4.
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Pointwise %5 + %2; both loops are parallel and every operand uses the identity map.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side ABI stub: import the arguments, dispatch once, export the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Workload values handed to the dispatch (32 and 16).
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Snapshot printed after iree-util-fuse-globals. Still at the flow level:
// the executable is a flow.executable and the host function calls flow.dispatch.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  flow.executable private @simple_matmul_tensor_dispatch_0 {
    // Takes two workload indices and returns three workgroup-count indices.
    flow.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Inputs: 32x24, 24x16 and 32x16 f32 tensors (read-only); output: 32x16 f32 (write-only).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !flow.dispatch.tensor<readonly:32x24xf32>, %arg1: !flow.dispatch.tensor<readonly:24x16xf32>, %arg2: !flow.dispatch.tensor<readonly:32x16xf32>, %arg3: !flow.dispatch.tensor<writeonly:32x16xf32>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %2 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Uninitialised 32x16 tensor; serves as destination for both the fill and the generic.
        %3 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %5 = linalg.matmul ins(%0, %1 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%4 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds the loaded third operand (%2) to the matmul result element by element.
        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %2 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%3 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %7 = arith.addf %arg4, %arg5 : f32
          linalg.yield %7 : f32
        } -> tensor<32x16xf32>
        // Result is written to %arg3 in a single full-size store.
        flow.dispatch.tensor.store %6, %arg3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: three hal.tensor.import ops, one flow.dispatch, one hal.tensor.export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32>
    %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32>
    %2 = hal.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32>
    // Workload is [%c32, %c16]; operands are the three imported tensors.
    %3 = flow.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (tensor<32x24xf32>, tensor<24x16xf32>, tensor<32x16xf32>) -> tensor<32x16xf32>
    %4 = hal.tensor.export %3 : tensor<32x16xf32> -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After ConvertToStream (iree-stream-conversion) ('builtin.module' operation) //----- //
// Snapshot printed after iree-stream-conversion. The executable is now a
// stream.executable whose dispatch function takes !stream.binding arguments,
// and the host function works on !stream.resource values with explicit sizes.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // The workgroup-count region still calls flow.dispatch.default_workgroup_count;
    // only the terminator is a stream op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %c0 = arith.constant 0 : index
        // Each binding is viewed at offset 0 as a typed dispatch tensor:
        // three read-only inputs and one write-only output.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %cst = arith.constant 0.000000e+00 : f32
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-fill, then matmul into the filled tensor.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6; the outs element (%arg6) is unused in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point. For each argument: assert the buffer view's
  // shape/type/encoding, import it as an external stream resource, then
  // transfer it to an unconstrained (<*>) resource for the dispatch.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 is presumably the HAL element-type code for f32 and
    // encoding 1 the dense row-major encoding -- confirm against the HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c32_0 = arith.constant 32 : index
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32_0, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    // The same constants are materialised again (with numeric suffixes) for each argument.
    %c553648160_i32_1 = arith.constant 553648160 : i32
    %c1_i32_2 = arith.constant 1 : i32
    %c24_3 = arith.constant 24 : index
    %c16_4 = arith.constant 16 : index
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24_3, %c16_4]) type(%c553648160_i32_1) encoding(%c1_i32_2)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    %c553648160_i32_5 = arith.constant 553648160 : i32
    %c1_i32_6 = arith.constant 1 : i32
    %c32_7 = arith.constant 32 : index
    %c16_8 = arith.constant 16 : index
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32_7, %c16_8]) type(%c553648160_i32_5) encoding(%c1_i32_6)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // %9 is the size of the dispatch result; it is a second sizeof of the same type as %6.
    %9 = stream.tensor.sizeof tensor<32x16xf32> : index
    %10 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9}
    // Transfer the result back to an external resource and export it as a buffer view.
    %11 = stream.async.transfer %10 : !stream.resource<*>{%9} -> !stream.resource<external>{%9}
    %12 = stream.tensor.export %11 : tensor<32x16xf32> in !stream.resource<external>{%9} -> !hal.buffer_view
    return %12 : !hal.buffer_view
  }
}


// -----// IR Dump After VerifyLoweringToTensors (iree-stream-verify-lowering-to-tensors) ('builtin.module' operation) //----- //
// Snapshot printed after iree-stream-verify-lowering-to-tensors. The host
// function contains no flow.dispatch or hal.tensor.* ops; tensors cross the
// ABI boundary through stream.tensor.import / stream.tensor.export.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Two workload indices in, three workgroup-count indices out.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body operating on four opaque bindings (three inputs, one output).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %c0 = arith.constant 0 : index
        // Typed views of the bindings, all taken at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %cst = arith.constant 0.000000e+00 : f32
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Matmul of the first two inputs into the zero-filled accumulator %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds the third input (%6) to the matmul result, element by element.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the sum through the write-only view of %arg3.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: per-argument assert / sizeof / import / transfer,
  // followed by a single async dispatch and the export of its result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): the type/encoding constants look like the HAL codes for f32
    // and dense row-major layout -- confirm before relying on that reading.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c32_0 = arith.constant 32 : index
    %c24 = arith.constant 24 : index
    // Argument 0 is asserted to be 32x24.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32_0, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    %c553648160_i32_1 = arith.constant 553648160 : i32
    %c1_i32_2 = arith.constant 1 : i32
    %c24_3 = arith.constant 24 : index
    %c16_4 = arith.constant 16 : index
    // Argument 1 is asserted to be 24x16.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24_3, %c16_4]) type(%c553648160_i32_1) encoding(%c1_i32_2)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    %c553648160_i32_5 = arith.constant 553648160 : i32
    %c1_i32_6 = arith.constant 1 : i32
    %c32_7 = arith.constant 32 : index
    %c16_8 = arith.constant 16 : index
    // Argument 2 is asserted to be 32x16.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32_7, %c16_8]) type(%c553648160_i32_5) encoding(%c1_i32_6)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // Result size for the dispatch, computed separately from %6.
    %9 = stream.tensor.sizeof tensor<32x16xf32> : index
    %10 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9}
    %11 = stream.async.transfer %10 : !stream.resource<*>{%9} -> !stream.resource<external>{%9}
    %12 = stream.tensor.export %11 : tensor<32x16xf32> in !stream.resource<external>{%9} -> !hal.buffer_view
    return %12 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Snapshot printed after canonicalize on @simple_matmul_tensor. The host
// function holds one copy of each constant (%c32, %c16, %c24, type, encoding),
// and all three buffer-view asserts share them.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Returns the default workgroup count derived from the two workload values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: matmul of bindings 0 and 1, plus binding 2, written to binding 3.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // In this snapshot %cst is defined after the subspans.
        %cst = arith.constant 0.000000e+00 : f32
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Pointwise add of %9 and %6 over a 2-D parallel iteration space.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Single set of constants used by every assert and by the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the HAL f32 element-type code and the dense
    // row-major encoding -- confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Argument 0: assert 32x24, import as external resource, transfer to <*>.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    // Argument 1: same sequence for the 24x16 operand.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    // Argument 2: same sequence for the 32x16 operand.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // %9 repeats the sizeof already computed as %6; both are still present in this snapshot.
    %9 = stream.tensor.sizeof tensor<32x16xf32> : index
    %10 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%9}
    %11 = stream.async.transfer %10 : !stream.resource<*>{%9} -> !stream.resource<external>{%9}
    %12 = stream.tensor.export %11 : tensor<32x16xf32> in !stream.resource<external>{%9} -> !hal.buffer_view
    return %12 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Snapshot printed after cse on @simple_matmul_tensor. The host function has
// exactly three stream.tensor.sizeof ops; the dispatch result reuses %6 as its size.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region for the single exported dispatch.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %c0 = arith.constant 0 : index
        // Bindings 0-2 become read-only tensors; binding 3 becomes the write-only result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %cst = arith.constant 0.000000e+00 : f32
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-initialise the accumulator, then run the 32x24 by 24x16 matmul into it.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise sum of the matmul result and the third input.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: validate and import the three arguments, dispatch, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type constant presumably encodes f32 and encoding 1 dense
    // row-major -- confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    // %6 sizes both the third input and, below, the dispatch result.
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    // Move the result to an external resource so it can be exported as a buffer view.
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Snapshot printed after iree-util-simplify-global-accesses on
// @simple_matmul_tensor. The function contains no global load/store ops.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export with a workgroup-count region taking the two workload indices.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: result = matmul(binding0, binding1) + binding2, stored to binding3.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %cst = arith.constant 0.000000e+00 : f32
        // Load each input in full (offsets 0, strides 1).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds %6 to the matmul result; only the two ins elements are read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point operating on stream resources; every resource carries an
  // explicit size operand in braces.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the f32 element-type code and the dense
    // row-major encoding expected of every argument -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // The dispatch takes the three transferred resources; its result is sized by %6.
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Module snapshot printed after iree-util-apply-patterns: one stream executable
// (the matmul dispatch) plus the host entry point @simple_matmul_tensor.
// #map is the 2-D identity map used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device side: an export describing the workgroup count and an inner module
  // holding the dispatch body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: forwards the two workload values to the default count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0/%arg1/%arg2 are bound read-only (32x24, 24x16, 32x16);
      // %arg3 is bound write-only (32x16) and receives the result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is viewed at offset 0 as a dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a zero-filled 32x16 tensor (%8).
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The out-operand element %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Write the full 32x16 sum to the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch,
  // and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code
    // for f32 and encoding 1 the dense layout - confirm against the IREE HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Per argument: assert shape/type/encoding, query the storage size, import as
    // !stream.resource<external>, then transfer to !stream.resource<*>.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // Three operands go in and one resource comes out, sized by %6; the dispatch
    // function above has four bindings.
    // NOTE(review): the result presumably maps to the fourth (write-only) binding - confirm.
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    // Move the result back to an external resource and export it as a buffer view.
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Module snapshot printed after iree-util-fold-globals. It contains no util.global
// ops; it holds the matmul dispatch executable and the host entry point.
// #map is the 2-D identity map shared by all operands of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device executable: workgroup-count export plus the dispatch body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // The two workload values are passed straight to the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Computes matmul(%arg0, %arg1) + %arg2 and stores it through %arg3.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Three read-only views and one write-only view, all at binding offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads of the inputs.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used both as the fill target and as the generic's out operand.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero accumulator for the matmul.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds the loaded third input (%6) to the matmul result, element by element.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole result to the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function: import inputs, dispatch, export output.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the HAL element-type code for f32 (0x21000020) - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // NOTE(review): presumably the dense encoding type - confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Input 0 (32x24): assert, size, import as external, transfer to '*'.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    // Input 1 (24x16): same sequence.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    // Input 2 (32x16): same sequence.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // Dispatch with workload [32, 16]; the result has the same size value (%6) as input 2.
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    // Transfer back to an external resource and export as the returned buffer view.
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Module snapshot printed after iree-util-fuse-globals: the matmul dispatch
// executable followed by the host entry point that calls it.
// #map: 2-D identity indexing map for the elementwise linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single dispatch entry point.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Maps the 2-value workload to an (x, y, z) workgroup count via the default op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Inputs arrive as bindings %arg0..%arg2 (read-only); output leaves via %arg3 (write-only).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Typed views over the raw bindings, each starting at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Full-extent loads: sizes equal the static tensor shapes.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Matmul starts from an all-zero accumulator rather than from the third input.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // The third input is added afterwards: result = %9 + %6 per element.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Full-extent store of the sum into the output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point taking and returning !hal.buffer_view values.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type/encoding codes below are presumably f32 (0x21000020) and
    // dense layout - confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each input is asserted against its expected shape before being imported.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    // Sizes are still symbolic stream.tensor.sizeof queries in this snapshot.
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // Run the dispatch over workload [32, 16] on the three transferred inputs.
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    // Result goes back to an external resource, then out as a 32x16 f32 buffer view.
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::CombineInitializersPass (iree-util-combine-initializers) ('builtin.module' operation) //----- //
// Module snapshot printed after iree-util-combine-initializers. It contains no
// util.initializer ops; it holds the dispatch executable and the host entry point.
// #map is the identity map on (d0, d1) referenced by the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device code for the fused matmul + add.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export with a workgroup-count region taking the two workload dimensions.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body over four bindings: three read-only inputs and one write-only output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Bindings reinterpreted as statically shaped dispatch tensors at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // %4 is the 32x24 operand, %5 the 24x16 operand, %6 the 32x16 addend.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // %8 = zeros(32x16); %9 = %4 x %5 accumulated into %8.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %10 = %9 + %6, elementwise over both parallel dimensions.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Result is written to the %arg3 view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side wrapper that marshals buffer views into and out of the dispatch.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 == 0x21000020, presumably the f32 element-type code;
    // encoding 1 is presumably dense - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Assert -> sizeof -> import (external) -> transfer ('*'), repeated for each input.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // Dispatch over workload [32, 16]; produces a new resource sized by %6.
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    // Hand the result back to the caller as a buffer view.
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After EncodeDeviceTensors (iree-stream-encode-device-tensors) ('stream.executable' operation: @simple_matmul_tensor_dispatch_0) //----- //
// Module snapshot printed after iree-stream-encode-device-tensors (run on the
// stream.executable). All dispatch tensors here are statically shaped f32.
// #map is the 2-D identity map used by the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable the encode-device-tensors pass was applied to.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count is derived from the two workload values by the default op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: out(%arg3) = matmul(in(%arg0), in(%arg1)) + in(%arg2).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Binding views; access mode (readonly/writeonly) is part of each tensor type.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Loads cover each tensor entirely.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-initialize, then multiply-accumulate.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of %9 and %6; only the two input block arguments are used.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store covers the full 32x16 output.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point; tensor sizes are still computed by stream.tensor.sizeof here.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the HAL f32 element-type code (0x21000020) and the
    // dense encoding code - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // First input: expected shape [32, 24].
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.sizeof tensor<32x24xf32> : index
    %1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%0}
    %2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0}
    // Second input: expected shape [24, 16].
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %3 = stream.tensor.sizeof tensor<24x16xf32> : index
    %4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%3}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
    // Third input: expected shape [32, 16].
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %6 = stream.tensor.sizeof tensor<32x16xf32> : index
    %7 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%6}
    %8 = stream.async.transfer %7 : !stream.resource<external>{%6} -> !stream.resource<*>{%6}
    // Invoke the executable's export with workload [32, 16].
    %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%2, %5, %8) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}, !stream.resource<*>{%6}) -> !stream.resource<*>{%6}
    // Convert the result to an external resource and export it.
    %10 = stream.async.transfer %9 : !stream.resource<*>{%6} -> !stream.resource<external>{%6}
    %11 = stream.tensor.export %10 : tensor<32x16xf32> in !stream.resource<external>{%6} -> !hal.buffer_view
    return %11 : !hal.buffer_view
  }
}


// -----// IR Dump After EncodeHostTensors (iree-stream-encode-host-tensors) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module snapshot printed after iree-stream-encode-host-tensors (run on
// @simple_matmul_tensor). In the host function below, resource sizes are plain
// index constants; no stream.tensor.sizeof ops appear.
// #map is the 2-D identity map used by the linalg.generic in the dispatch.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device executable holding the matmul + add dispatch.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: forwards both workload values to the default count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: three read-only input bindings, one write-only output binding.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding at offset 0 as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load all three inputs in full.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Matmul into a zero-filled accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Add the third input elementwise to the matmul result; %arg6 is unused.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: import three buffer views, dispatch, export the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element counts of the three f32 tensors times 4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code
    // for f32 and encoding 1 the dense layout - confirm against the IREE HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Per input: assert shape/type/encoding, import as external, transfer to '*'.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [32, 16]; the result resource is 2048 in size.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer back to an external resource and export as the returned buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After MaterializeBuiltins (iree-stream-materialize-builtins) ('builtin.module' operation) //----- //
// Module snapshot printed after iree-stream-materialize-builtins: the matmul
// dispatch executable and the host entry point are the only top-level ops.
// #map is the identity map on (d0, d1) used by the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // The only executable in this module.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workload (2 values) -> workgroup count (x, y, z) through the default op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Reads %arg0 (32x24), %arg1 (24x16), %arg2 (32x16); writes %arg3 (32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Offset-0 views of the four bindings.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Materialize the inputs as tensors.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // The product is accumulated onto zeros.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // The 32x16 input is then added to the product elementwise.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Write back the full result.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point operating on !hal.buffer_view arguments and result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Constant resource sizes: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the HAL f32 element-type code (0x21000020) - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // NOTE(review): presumably the dense encoding code - confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Input 0: validate as [32, 24], import with size 3072, transfer to '*'.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Input 1: validate as [24, 16], import with size 1536, transfer to '*'.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Input 2: validate as [32, 16], import with size 2048, transfer to '*'.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Launch the dispatch over workload [32, 16] on the three inputs.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Return the result as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After MaterializeCopyOnWrite (iree-stream-materialize-copy-on-write) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry used by @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: takes the two workload indices supplied at the dispatch site and
    // returns the (x, y, z) workgroup count produced by the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: computes matmul(%arg0, %arg1) + %arg2 and writes it to %arg3.
      // %arg0..%arg2 are viewed as read-only 32x24, 24x16 and 32x16 f32 tensors;
      // %arg3 is viewed as the write-only 32x16 f32 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: zero-filled (%8) as the matmul accumulator, and as the
        // outs operand of the element-wise add further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result (%9) and the third input (%6).
        // The region never reads %arg6 (the outs element), so only %arg4 + %arg5 is yielded.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result through the write-only output view.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks and imports three buffer views, runs the dispatch on them,
  // and exports the dispatch result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below; they equal the f32 byte counts of the tensors:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values for the shape asserts (together with %c24 below); %c32 and %c16
    // are also the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Codes passed to type(...) and encoding(...) in the asserts.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Operand 0: assert a 32x24 view, import it as an external resource, then transfer it to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    // Operand 1: same sequence for the 24x16 view.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    // Operand 2: same sequence for the 32x16 view.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands in, one 2048-sized resource out.
    // NOTE(review): the executable function declares four bindings; presumably the fourth is this result - confirm.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // Transfer the result to an external resource and export it as a 32x16 f32 buffer view.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After ElideAsyncCopies (iree-stream-elide-async-copies) ('builtin.module' operation) //----- //
// Module snapshot printed after the ElideAsyncCopies pass: one dispatch
// executable plus the host entry point that invokes it.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted, imported as an `external` resource, then transferred to a `*` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    %1 = stream.async.transfer %0 : !stream.resource<external>{%c3072} -> !stream.resource<*>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c1536} -> !stream.resource<*>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %4 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %5 = stream.async.transfer %4 : !stream.resource<external>{%c2048} -> !stream.resource<*>{%c2048}
    // Dispatch with workload [%c32, %c16]: three resource operands, one new 2048-sized result.
    %6 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%1, %3, %5) : (!stream.resource<*>{%c3072}, !stream.resource<*>{%c1536}, !stream.resource<*>{%c2048}) -> !stream.resource<*>{%c2048}
    // The result is transferred back to an `external` resource before export.
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c2048} -> !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After RefineUsage (iree-stream-refine-usage) ('builtin.module' operation) //----- //
// Module snapshot printed after the RefineUsage pass. In this dump the host
// function contains no stream.async.transfer ops and every resource is typed
// !stream.resource<external>.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted and imported directly as an `external` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // The dispatch consumes the imported resources and produces an `external` result that is exported as-is.
    %3 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%0, %1, %2) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After ScheduleExecution (iree-stream-schedule-execution) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module snapshot printed after the ScheduleExecution pass. In this dump the
// host-side dispatch sits inside a stream.async.execute region whose result is
// awaited through a timepoint before export.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted and imported directly as an `external` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // The execute region captures %0-%2 as %arg3-%arg5, yields the dispatch result, and also produces a timepoint.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    // The export consumes the awaited value %3 rather than %results directly.
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After ScheduleConcurrency (iree-stream-schedule-concurrency) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module snapshot printed after the ScheduleConcurrency pass. The execute
// region in the host function holds a single dispatch op.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted and imported directly as an `external` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // The execute region captures %0-%2 as %arg3-%arg5, yields the dispatch result, and also produces a timepoint.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    // The export consumes the awaited value %3 rather than %results directly.
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After PropagateTimepoints (iree-stream-propagate-timepoints) ('builtin.module' operation) //----- //
// Module snapshot printed after the PropagateTimepoints pass. In this dump the
// host function defines four stream.timepoint.immediate values and the execute
// op carries an await(...) clause.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted and imported directly as an `external` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Four immediate timepoints are created; only %6 is referenced afterwards (by the execute op's await clause). %3-%5 have no uses in this function.
    %3 = stream.timepoint.immediate => !stream.timepoint
    %4 = stream.timepoint.immediate => !stream.timepoint
    %5 = stream.timepoint.immediate => !stream.timepoint
    %6 = stream.timepoint.immediate => !stream.timepoint
    // The execute region awaits %6, captures %0-%2 as %arg3-%arg5, and yields the dispatch result plus a timepoint.
    %results, %result_timepoint = stream.async.execute await(%6) => with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %9 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %9 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    // The export consumes the awaited value %7 rather than %results directly.
    %7 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %8 = stream.tensor.export %7 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %8 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module snapshot printed after the Canonicalizer pass ran on
// @simple_matmul_tensor. In this dump the host function has no
// stream.timepoint.immediate ops and the execute op has no await(...) clause.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted and imported directly as an `external` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // The execute region captures %0-%2 as %arg3-%arg5, yields the dispatch result, and also produces a timepoint.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    // The export consumes the awaited value %3 rather than %results directly.
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Module snapshot printed after the CSE pass ran on @simple_matmul_tensor:
// one dispatch executable plus the host entry point that invokes it.
// #map is the 2-D identity map used by the element-wise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0-%arg2 are viewed as read-only tensors, %arg3 as the write-only result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is accessed at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a fresh zero-filled 32x16 tensor (%8), not into the loaded %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise sum of the matmul result (%9) and the third input (%6); the outs block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: asserts and imports three buffer views, runs the dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors — confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands of the type(...) / encoding(...) assert clauses; presumably the f32 element-type code and a dense encoding — confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted and imported directly as an `external` resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // The execute region captures %0-%2 as %arg3-%arg5, yields the dispatch result, and also produces a timepoint.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    // The export consumes the awaited value %3 rather than %results directly.
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: one stream executable holding the matmul dispatch
// body, plus the host entry point that launches it.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // The workgroup count is produced from the two workload values by
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: loads a 32x24, a 24x16 and a 32x16 f32 tensor from the three
      // read-only bindings, multiplies the first two into a zero-filled accumulator,
      // adds the third elementwise and stores the 32x16 result to the write-only binding.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6); the
        // block argument for the output operand (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: checks the three incoming buffer views, imports them as
  // stream resources, runs the dispatch asynchronously and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes; they equal element count x 4 for the paired f32 tensor types.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): expected type/encoding codes for the asserts below; presumably
    // f32 / dense — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Dispatch with workload [%c32, %c16]; the region yields one 2048-sized resource
    // and the op also returns a timepoint that is awaited before export.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: the dispatch executable and the host function
// that invokes it through stream.async.execute.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export region: maps the two workload indices to an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body taking four bindings: three read-only inputs (32x24, 24x16,
      // 32x16) and one write-only 32x16 output.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // All bindings are viewed from offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // The matmul accumulates into a tensor filled with 0.0.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // The third input (%6) is added to the matmul result elementwise.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function: assert + import each argument, execute the dispatch, await
  // its timepoint, export the 32x16xf32 result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Sizes attached to the stream resources (3072 / 1536 / 2048).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type and encoding codes checked by every assert below; their
    // meaning is not visible here — presumably f32 / dense, confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // The execute region forwards its three captures straight to the dispatch.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: a private stream executable with a single
// exported matmul entry point, and the public host function that calls it.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count comes from flow.dispatch.default_workgroup_count applied to
    // the two workload arguments.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Computes (32x24 input) x (24x16 input) + (32x16 input) and writes the
      // 32x16 result to the fourth, write-only binding.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination for the matmul accumulator and as
        // the outs operand of the elementwise add.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          // Only the two input elements are used; %arg6 is ignored.
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function: validates and imports the three buffer views, launches the
  // dispatch inside stream.async.execute, then awaits and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): values supplied as type(...) / encoding(...) to the asserts;
    // presumably the codes for f32 and a dense layout — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Shapes asserted: [32, 24], [24, 16] and [32, 16] for %arg0..%arg2.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: the matmul dispatch executable followed by the
// host function that imports the arguments and schedules the dispatch.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export with a workgroups region returning three index values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: three whole-tensor loads, a zero-initialized matmul, an
      // elementwise add with the third input, and one whole-tensor store.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // 32x24 times 24x16, accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Both iterators are parallel and all three operands share #map.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function marked iree.abi.stub; takes and returns !hal.buffer_view values.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Sizes attached to the imported and produced stream resources.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // %c32 and %c16 are used both as asserted dimensions and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): expected type/encoding codes for the asserts; semantics are
    // not visible in this dump — presumably f32 / dense, confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    // The result resource is only exported after its timepoint has been awaited.
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After VerifyLoweringToAsync (iree-stream-verify-lowering-to-async) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: host-side work is still expressed with
// stream.async.* ops (execute / dispatch / yield) around one dispatch executable.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count is derived from the two workload arguments.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: matmul of the first two inputs plus the third input,
      // written to the write-only 32x16 binding.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // %0..%2 are read-only views; %3 is the write-only output view.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds %6 to the matmul result elementwise; %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function: assert, import, asynchronously execute, await, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (f32 elements).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type/encoding codes expected by the asserts; presumably
    // f32 / dense — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Yields one result resource plus a timepoint.
    %results, %result_timepoint = stream.async.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048} {
      %5 = stream.async.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%arg3, %arg4, %arg5) : (!stream.resource<external>{%c3072}, !stream.resource<external>{%c1536}, !stream.resource<external>{%c2048}) -> !stream.resource<external>{%c2048}
      stream.yield %5 : !stream.resource<external>{%c2048}
    } => !stream.timepoint
    %3 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c2048}
    %4 = stream.tensor.export %3 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %4 : !hal.buffer_view
  }
}


// -----// IR Dump After ScheduleAllocation (iree-stream-schedule-allocation) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: the host function now uses an explicit
// stream.resource.alloc for the result and stream.cmd.* ops with per-binding
// access modes to launch the dispatch.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count is derived from the two workload arguments.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: loads three read-only tensors (32x24, 24x16, 32x16), computes
      // the matmul of the first two into a zero-filled accumulator, adds the third
      // elementwise and stores the 32x16 result to the write-only binding.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of %9 and %6; the output block argument %arg6 is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function: asserts and imports the three arguments, allocates the output
  // resource, records the dispatch in a stream.cmd.execute region, awaits the
  // resulting timepoint and exports the output as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes; they equal element count x 4 for the paired f32 tensor types.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): expected type/encoding codes for the asserts below; presumably
    // f32 / dense — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // %c0 is the offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Output storage is allocated uninitialized and handed to the dispatch as the
    // write-only binding.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Bindings: three read-only inputs and one write-only output, each covering
      // its whole resource ([%c0 for <size>]).
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // The allocated resource %3 is exported once the execute timepoint %4 is awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After PackConstants (iree-stream-pack-constants) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module contents in this dump: the dispatch executable and a host function that
// launches it through stream.cmd.execute with an explicitly allocated output.
module {
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export region: maps the two workload indices to an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body taking four bindings: three read-only inputs (32x24, 24x16,
      // 32x16) and one write-only 32x16 output that receives matmul + add.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // The matmul accumulates into a tensor filled with 0.0.
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // The third input (%6) is added to the matmul result elementwise.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host function: assert + import each argument, allocate the output, dispatch,
  // await the timepoint, export the 32x16xf32 result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (f32 elements).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type and encoding codes checked by every assert below; their
    // meaning is not visible here — presumably f32 / dense, confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %c0 = arith.constant 0 : index
    // Uninitialized 2048-sized resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Four captures: the three imported inputs and the allocated output.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After PackAllocations (iree-stream-pack-allocations) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LayoutSlices (iree-stream-layout-slices) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::PropagateSubrangesPass (iree-util-propagate-subranges) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side executable: one exported entry point plus the function that
  // implements it.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed read-only and %arg3 write-only
      // (see the subspan result types below). Net effect: %arg3 = %arg0 x %arg1 + %arg2.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the three inputs in full (zero offsets, unit strides, sizes equal to the whole shape).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a 32x16 tensor without initial contents; %8 is %7 filled with 0.0
        // and serves as the matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) x %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The output block argument %arg6 is never read, so
        // the contents of %7 do not influence the result.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point: validates and imports the three buffer views, runs
  // the single dispatch, and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Zero offset used for every binding range in the dispatch below.
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e. the
    // element count of each tensor times 4 (consistent with f32 byte sizes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension values used by the shape asserts and as the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) checks below.
    // NOTE(review): 553648160 is 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    // Inner dimension shared by the 32x24 and 24x16 operands.
    %c24 = arith.constant 24 : index
    // For each argument: assert shape, element type and encoding, then import
    // it as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]. The three imported resources
    // are bound read-only (ro) and %3 write-only (wo), each over its whole
    // range [%c0 for size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After VerifyLoweringToCmd (iree-stream-verify-lowering-to-cmd) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload values into a 3-D workgroup count via
    // flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Bindings %arg0..%arg2 are viewed as read-only tensors
      // (32x24, 24x16, 32x16) and %arg3 as the write-only 32x16 result.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Every binding is taken at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of each read-only input.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // The matmul accumulates into a 0.0-filled 32x16 tensor rather than into %6.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the third input (%6).
        // The outs block argument (%arg6) is not read.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Entry point: takes three HAL buffer views and returns one.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the byte sizes of the f32 tensors they are paired with.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) and 1 are the element-type and encoding
    // codes the asserts expect; presumably f32 and dense row-major - confirm
    // against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape / element type / encoding, then import it
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that the dispatch writes its result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]: three read-only (ro) captures and one
    // write-only (wo) capture, each spanning its whole resource from offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint, then export the output resource as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // NOTE(review): the iree.fixedpoint.iteration attribute above is presumably
  // bookkeeping for a fixed-point pass iteration driver; confirm in the
  // pipeline source.
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // NOTE(review): the iree.fixedpoint.iteration attribute above is presumably
  // bookkeeping for a fixed-point pass iteration driver; confirm in the
  // pipeline source.
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // NOTE(review): the iree.fixedpoint.iteration attribute above is presumably
  // bookkeeping for a fixed-point pass iteration driver; confirm in the
  // pipeline source.
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // NOTE(review): the iree.fixedpoint.iteration attribute above is presumably
  // bookkeeping for a fixed-point pass iteration driver; confirm in the
  // pipeline source.
  // Executable that holds the single dispatch entry point used by
  // @simple_matmul_tensor below.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export: turns the two workload indices supplied at the dispatch site
    // into an (x, y, z) workgroup count using the default workgroup-count op.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. %arg0..%arg2 are viewed as read-only tensors and %arg3
      // as the write-only result (see the subspan result types below).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a statically shaped
        // dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every input (zero offsets, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled 32x16 accumulator for the matmul.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 (32x24) times %5 (24x16), accumulated into the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Element-wise add of the matmul result %9 and the loaded third input
        // %6, using the identity map #map for every operand. The outs block
        // argument %arg6 is never read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the whole 32x16 result into the write-only output binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): checks the three incoming
  // buffer views, issues the dispatch, and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. the f32 tensor shapes at 4 bytes per element.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32, and encoding 1 presumably dense row-major; confirm against
    // the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape, element type and encoding,
    // then import the buffer view as an external stream resource of that size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region with a single dispatch over workload [%c32, %c16]. The
    // three inputs are bound read-only (ro), the result write-only (wo).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as the
    // returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module state printed after iree-util-fold-globals. It carries the
// iree.fixedpoint.iteration = 0 attribute and holds one device executable plus the host entry point.
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // Device-side executable containing the single dispatch of this program.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two workload values passed at the dispatch site into an (x, y, z) workgroup count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: matmul(%arg0, %arg1) accumulated into a zero-filled 32x16 tensor,
      // then added elementwise to the 32x16 tensor behind %arg2; the sum is stored through %arg3.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // All four bindings are viewed at byte offset 0: three read-only inputs and one write-only output.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Full-tensor loads (zero offsets, unit strides, sizes equal to the static shapes).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is used twice: as the destination of the zero fill and as the outs operand of the generic.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result (%9) and the loaded third input (%6).
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        // %arg6 is the outs element; the body does not read it.
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: validates and imports three buffer views, runs the dispatch, and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes in bytes for f32 tensors: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type code for f32 and
    // encoding 1 the dense row-major encoding — confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is shape/type/encoding-checked before being imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Separate uninitialized 2048-byte output resource; the dispatch writes it through the `wo` binding.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [32, 16]; every resource is bound whole (offset 0, full length).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the output resource as a buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module state printed after iree-util-fuse-globals. The module defines no globals here and
// the IR appears unchanged from the preceding dump in this file.
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // Device-side executable containing the single dispatch of this program.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: derives the (x, y, z) workgroup count from the two workload values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: zero-filled matmul of the first two bindings, plus the third binding elementwise,
      // stored to the fourth (write-only) binding.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Bindings are all viewed at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads of the three inputs.
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the generic.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6; the outs element (%arg6) is not read in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: assert + import the three arguments, dispatch once, await, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4, 24*16*4 and 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type code 553648160 (0x21000020) and encoding 1 presumably mean f32 / dense
    // row-major in the HAL — confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output storage, left uninitialized because the dispatch binds it write-only.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch over workload [32, 16]; each resource is bound from offset 0 for its full length.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // The result resource becomes usable only after the execution timepoint is awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After ElideTimepoints (iree-stream-elide-timepoints) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module state printed after iree-stream-elide-timepoints. The host function has a single
// timepoint (produced by stream.cmd.execute, consumed by one await), which is still present here.
module attributes {iree.fixedpoint.iteration = 0 : index} {
  // Device-side executable containing the single dispatch of this program.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: derives the (x, y, z) workgroup count from the two workload values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: out = matmul(in0, in1) + in2, where the matmul accumulates into a zero-filled tensor.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Three read-only inputs and one write-only output, each viewed at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is shared by the fill (as destination) and the generic (as outs operand).
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of %9 and %6; %arg6 (the outs element) is unused in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: assert + import the three arguments, dispatch once, await, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4, 24*16*4 and 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type code 553648160 (0x21000020) and encoding 1 presumably mean f32 / dense
    // row-major in the HAL — confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output storage, left uninitialized because the dispatch binds it write-only.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // %4 is the only timepoint in the function.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Its single consumer: the await that guards the export of the output resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass (iree-util-fixed-point-iterator) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module state printed after iree-util-fixed-point-iterator. Unlike the dumps immediately
// before it in this file, the module no longer carries the iree.fixedpoint.iteration attribute.
module {
  // Device-side executable containing the single dispatch of this program.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: derives the (x, y, z) workgroup count from the two workload values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: out = matmul(in0, in1) + in2, where the matmul accumulates into a zero-filled tensor.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Three read-only inputs and one write-only output, each viewed at offset 0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is shared by the fill (as destination) and the generic (as outs operand).
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of %9 and %6; %arg6 (the outs element) is unused in the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: assert + import the three arguments, dispatch once, await, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4, 24*16*4 and 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type code 553648160 (0x21000020) and encoding 1 presumably mean f32 / dense
    // row-major in the HAL — confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output storage, left uninitialized because the dispatch binds it write-only.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch over workload [32, 16]; each resource is bound from offset 0 for its full length.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // The result resource becomes usable only after the execution timepoint is awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After FuseDispatchBindings (iree-stream-fuse-dispatch-bindings) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module state printed after iree-stream-fuse-dispatch-bindings. The dispatch function now takes
// one extra index operand per binding, which is added to that binding's subspan offset, and the
// dispatch site passes a matching operand list. The four bindings themselves remain distinct.
module {
  // Device-side executable containing the single dispatch of this program.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: derives the (x, y, z) workgroup count from the two workload values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: out = matmul(in0, in1) + in2.
      // %arg0..%arg3 are the bindings; %arg4..%arg7 are the per-binding offsets supplied by the caller.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Each subspan offset is the original static offset (%c0) plus the dynamic offset operand.
        // The `0 + x` additions are left unfolded in this dump.
        %0 = arith.addi %c0, %arg4 : index
        %1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %2 = arith.addi %c0, %arg5 : index
        %3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %4 = arith.addi %c0, %arg6 : index
        %5 = stream.binding.subspan %arg2[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %6 = arith.addi %c0, %arg7 : index
        %7 = stream.binding.subspan %arg3[%6] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads of the three inputs.
        %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %10 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %11 is shared by the fill (as destination) and the generic (as outs operand).
        %11 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %13 = linalg.matmul ins(%8, %9 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of %13 and %10; %arg10 (the outs element) is unused in the body.
        %14 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%13, %10 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%11 : tensor<32x16xf32>) {
        ^bb0(%arg8: f32, %arg9: f32, %arg10: f32):
          %15 = arith.addf %arg8, %arg9 : f32
          linalg.yield %15 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %14, %7, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: assert + import the three arguments, dispatch once, await, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4, 24*16*4 and 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type code 553648160 (0x21000020) and encoding 1 presumably mean f32 / dense
    // row-major in the HAL — confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output storage, left uninitialized because the dispatch binds it write-only.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Second zero constant (duplicate of %c0), used below as the resource range offset.
    %c0_0 = arith.constant 0 : index
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // The operand list carries the four binding offsets (all %c0); the resource ranges start at %c0_0.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%c0, %c0, %c0, %c0 : index, index, index, index) {
        ro %arg3[%c0_0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0_0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0_0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0_0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // The result resource becomes usable only after the execution timepoint is awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After PackDispatchOperands (iree-stream-pack-dispatch-operands) ('builtin.module' operation) //----- //
// Identity 2-D indexing map; used for every operand of the elementwise linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Module state printed after iree-stream-pack-dispatch-operands. The four binding-offset operands
// of the dispatch are now i32 instead of index: the dispatch function casts them back to index on
// entry and the dispatch site passes i32 constants.
module {
  // Device-side executable containing the single dispatch of this program.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: derives the (x, y, z) workgroup count from the two workload values.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: out = matmul(in0, in1) + in2.
      // %arg0..%arg3 are the bindings; %arg4..%arg7 are the per-binding offsets as i32 values.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
        // Convert the i32 operands to index values for the offset arithmetic below.
        %0 = arith.index_cast %arg4 : i32 to index
        %1 = arith.index_cast %arg5 : i32 to index
        %2 = arith.index_cast %arg6 : i32 to index
        %3 = arith.index_cast %arg7 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Each subspan offset is the static offset (%c0) plus the converted operand.
        // The `0 + x` additions are left unfolded in this dump.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads of the three inputs.
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %15 is shared by the fill (as destination) and the generic (as outs operand).
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of %17 and %14; %arg10 (the outs element) is unused in the body.
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg8: f32, %arg9: f32, %arg10: f32):
          %19 = arith.addf %arg8, %arg9 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point: assert + import the three arguments, dispatch once, await, export.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // %c0 is no longer referenced in this function after this pass (the dispatch now uses i32 zeros).
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4, 24*16*4 and 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type code 553648160 (0x21000020) and encoding 1 presumably mean f32 / dense
    // row-major in the HAL — confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output storage, left uninitialized because the dispatch binds it write-only.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Zero index constant used as the resource range offset in the dispatch below.
    %c0_0 = arith.constant 0 : index
    // One i32 zero per dispatch operand; the four constants are identical and not yet deduplicated here.
    %c0_i32 = arith.constant 0 : i32
    %c0_i32_1 = arith.constant 0 : i32
    %c0_i32_2 = arith.constant 0 : i32
    %c0_i32_3 = arith.constant 0 : i32
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // The operand list carries the four binding offsets as i32; the resource ranges start at %c0_0.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%c0_i32, %c0_i32_1, %c0_i32_2, %c0_i32_3 : i32, i32, i32, i32) {
        ro %arg3[%c0_0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0_0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0_0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0_0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // The result resource becomes usable only after the execution timepoint is awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable containing the single dispatch function of this program
  // (state printed after module-level CSE).
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export entry: turns the two index workload operands into an (x, y, z)
    // workgroup count via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: %arg0..%arg3 are the resource bindings; %arg4..%arg7 are
      // i32 operands that feed only the per-binding subspan offsets below.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
        // Widen each i32 operand to index so it can be used as an offset.
        %0 = arith.index_cast %arg4 : i32 to index
        %1 = arith.index_cast %arg5 : i32 to index
        %2 = arith.index_cast %arg6 : i32 to index
        %3 = arith.index_cast %arg7 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Offset for each binding is computed as %c0 + <casted operand>.
        // Bindings 0..2 are viewed read-only, binding 3 write-only.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load the full extent of every read-only operand (zero offsets, unit strides).
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Matmul accumulates into a zero-filled 32x16 tensor rather than into %14.
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add of the matmul result and the loaded third operand (%14).
        // The outs block argument (%arg10) is not read by the body.
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg8: f32, %arg9: f32, %arg10: f32):
          %19 = arith.addf %arg8, %arg9 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        // Write the full 32x16 result into the write-only binding.
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point (tagged iree.abi.stub): asserts and imports three buffer
  // views, issues one dispatch, then exports the output resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes; each equals element count x 4 for the f32 tensor type it
    // is paired with below (32x24 -> 3072, 24x16 -> 1536, 32x16 -> 2048).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 (0x21000020) is presumably the HAL element-type
    // code for f32 and 1 the dense row-major encoding - confirm against IREE HAL.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an
    // external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Separate uninitialized resource that receives the dispatch output.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // In this dump there is a single index zero and a single i32 zero; all
    // four dispatch operands below reuse %c0_i32.
    %c0 = arith.constant 0 : index
    %c0_i32 = arith.constant 0 : i32
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Workload is [%c32, %c16]; three resources are bound read-only (ro)
      // and the freshly allocated one write-only (wo), each over its full range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16](%c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32) {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execution timepoint before exporting the output resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After FoldUniformOperands (iree-stream-fold-uniform-operands) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable as printed after iree-stream-fold-uniform-operands: the
  // dispatch function now takes only the four bindings.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count is derived from the two index workload operands.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body with no i32 operands; the offset source is the local
      // constant %c0_i32 defined at the top of the function.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
        %c0_i32 = arith.constant 0 : i32
        // Four casts of the same zero constant, one per binding offset.
        %0 = arith.index_cast %c0_i32 : i32 to index
        %1 = arith.index_cast %c0_i32 : i32 to index
        %2 = arith.index_cast %c0_i32 : i32 to index
        %3 = arith.index_cast %c0_i32 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Each offset is %c0 + cast(0), i.e. every subspan starts at zero.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Full-extent loads of the three read-only operands.
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-filled accumulator for the matmul.
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds the matmul result to %14 elementwise; %arg6 (the outs element) is unused.
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %19 = arith.addf %arg4, %arg5 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        // Store the sum into the write-only output binding.
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point as printed after iree-stream-fold-uniform-operands: the
  // dispatch below is issued with a workload and bindings but no operands.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Sizes paired with the 32x24, 24x16 and 32x16 f32 tensors respectively
    // (each is element count x 4).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the HAL element-type code for f32 (0x21000020)
    // and encoding type 1 - confirm against the IREE HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Assert each incoming buffer view, then import it as an external resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output resource, allocated uninitialized and bound write-only below.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %c0 = arith.constant 0 : index
    // %c0_i32 has no remaining use in this function in this dump.
    %c0_i32 = arith.constant 0 : i32
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch over workload [%c32, %c16]; every binding covers its
      // resource from offset %c0 for the full size.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Wait on the timepoint, then export the output resource as the result.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After AnnotateDispatchArguments (iree-stream-annotate-dispatch-arguments) ('builtin.module' operation) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable as printed after iree-stream-annotate-dispatch-arguments: each
  // binding argument of the dispatch function carries a stream.alignment attribute.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count region: two index workload values in, (x, y, z) out.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // All four bindings are annotated {stream.alignment = 64 : index}.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        // Zero i32 constant, cast to index once per binding.
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.index_cast %c0_i32 : i32 to index
        %1 = arith.index_cast %c0_i32 : i32 to index
        %2 = arith.index_cast %c0_i32 : i32 to index
        %3 = arith.index_cast %c0_i32 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Subspan offsets are %c0 plus the casted zero, so each evaluates to zero.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load LHS (32x24), RHS (24x16) and the 32x16 addend in full.
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Matmul into a tensor filled with 0.0.
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %17 + %14; the body ignores the outs element %arg6.
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %19 = arith.addf %arg4, %arg5 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        // Result goes to the write-only binding.
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point as printed after iree-stream-annotate-dispatch-arguments.
  // Imports three buffer views, runs one dispatch, exports the 32x16 result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used by the imports, the allocation and the bindings.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    // Dimension constants used for the shape assertions and the dispatch workload.
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type/encoding codes passed to hal.buffer_view.assert;
    // presumably f32 and dense row-major - confirm against IREE HAL.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Input 0: asserted as [32, 24], imported as tensor<32x24xf32>.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    // Input 1: asserted as [24, 16], imported as tensor<24x16xf32>.
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    // Input 2: asserted as [32, 16], imported as tensor<32x16xf32>.
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output resource (uninitialized until the dispatch writes it).
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %c0 = arith.constant 0 : index
    // Not referenced by any later op in this function in this dump.
    %c0_i32 = arith.constant 0 : i32
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // ro = read-only binding, wo = write-only binding; each spans [%c0, size).
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // The export consumes %3 only after the execute timepoint has been awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable as printed in the dump for the canonicalize run on
  // func.func @simple_matmul_tensor (the host function, not this dispatch body).
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Computes the workgroup count from the two workload indices.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: three read-only bindings in, one write-only binding out,
      // each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        // The zero-offset arithmetic (cast + addi) is still present in this dump.
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.index_cast %c0_i32 : i32 to index
        %1 = arith.index_cast %c0_i32 : i32 to index
        %2 = arith.index_cast %c0_i32 : i32 to index
        %3 = arith.index_cast %c0_i32 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // One (addi, subspan) pair per binding.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Whole-tensor loads of the inputs.
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %15 is used both as the fill destination and as the generic's outs operand.
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Pointwise sum of the matmul result and %14 over both parallel dimensions.
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %19 = arith.addf %arg4, %arg5 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        // Full-extent store of the result.
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point as printed after canonicalize: all constants are grouped
  // at the top of the function and no i32 zero constant is present.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Index zero used as the start offset of every dispatch binding.
    %c0 = arith.constant 0 : index
    // Resource sizes (element count x 4 for the paired f32 tensor types).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably HAL codes for f32 element type and dense
    // row-major encoding - confirm against IREE HAL.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Assert + import for each of the three buffer-view arguments.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Destination resource for the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // One dispatch with workload [%c32, %c16]; inputs bound ro, output bound wo.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await completion, then hand the output back as a buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable as printed in the dump for the cse run on
  // func.func @simple_matmul_tensor (the host function).
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Export with a workgroup-count region over two index workload operands.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Computes out = matmul(binding0, binding1) + binding2 and stores it to binding3.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        // Four identical index_casts of the zero constant remain in this dump.
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.index_cast %c0_i32 : i32 to index
        %1 = arith.index_cast %c0_i32 : i32 to index
        %2 = arith.index_cast %c0_i32 : i32 to index
        %3 = arith.index_cast %c0_i32 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Binding views: offsets are %c0 + casted zero.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Materialize the three inputs as tensors.
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // (32x24) x (24x16) matmul accumulated onto zeros.
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Adds %14 to the matmul result; yields only the sum of the two ins elements.
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %19 = arith.addf %arg4, %arg5 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        // Store to the output binding.
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point as printed after function-scope cse. Every constant in
  // this function has a distinct value, so none are duplicates of one another.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Start offset for the dispatch bindings.
    %c0 = arith.constant 0 : index
    // Sizes for the 32x24, 24x16 and 32x16 f32 resources.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): element-type and encoding codes for the assertions;
    // presumably f32 / dense row-major - confirm against IREE HAL.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Validate each buffer view's shape/type/encoding before importing it.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized output resource, same size as the third input.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch the executable's export over workload [%c32, %c16].
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Resolve the timepoint to the output resource and export it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable as printed in the dump for iree-util-simplify-global-accesses
  // run on func.func @simple_matmul_tensor. No global ops appear in this executable.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup count from the (two-operand) workload.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch function over four 64-aligned bindings (3 inputs, 1 output).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        // Zero offsets are still expressed as index_cast + addi here.
        %c0_i32 = arith.constant 0 : i32
        %0 = arith.index_cast %c0_i32 : i32 to index
        %1 = arith.index_cast %c0_i32 : i32 to index
        %2 = arith.index_cast %c0_i32 : i32 to index
        %3 = arith.index_cast %c0_i32 : i32 to index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Typed views of the bindings: readonly x3, writeonly x1.
        %4 = arith.addi %c0, %0 : index
        %5 = stream.binding.subspan %arg0[%4] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %6 = arith.addi %c0, %1 : index
        %7 = stream.binding.subspan %arg1[%6] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %8 = arith.addi %c0, %2 : index
        %9 = stream.binding.subspan %arg2[%8] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %10 = arith.addi %c0, %3 : index
        %11 = stream.binding.subspan %arg3[%10] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full.
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %13 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // Zero-initialized accumulator, then matmul.
        %15 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %17 = linalg.matmul ins(%12, %13 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%16 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise addition of %17 and %14 (third block argument unused).
        %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%15 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %19 = arith.addf %arg4, %arg5 : f32
          linalg.yield %19 : f32
        } -> tensor<32x16xf32>
        // Write the 32x16 result out.
        flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host entry point as printed after iree-util-simplify-global-accesses.
  // The function body contains no global load/store ops.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Constants: binding start offset, resource sizes, dimensions, and the
    // type/encoding codes used by the assertions.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): presumably the HAL f32 element-type code (0x21000020) and
    // dense row-major encoding - confirm against IREE HAL.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, then imported as
    // an external resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output resource written by the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // The only command in this execute region: one dispatch with four bindings.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Block on the timepoint, then export %3's contents as tensor<32x16xf32>.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Module snapshot printed after the iree-util-apply-patterns pass (per the dump header above).
// It holds one stream executable with the device-side dispatch function and the
// host-side entry point @simple_matmul_tensor that launches it.
// #map is the 2-D identity map; the linalg.generic below uses it for all three
// of its operands, i.e. a plain elementwise access pattern.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side container: one public export plus a nested module with its function.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: turns the two index workload values into three
    // index results via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Takes four bindings (each annotated stream.alignment = 64):
      // %arg0..%arg2 are viewed read-only, %arg3 write-only. Net effect of the
      // body is out = (%arg0 as 32x24) x (%arg1 as 24x16) + (%arg2 as 32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // View each binding, at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a fresh 32x16 tensor; %8 is %7 filled with 0.0 and serves as the
        // matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 x %5, accumulated onto the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The outs operand %7 only provides the result
        // tensor: its block argument %arg6 is never read in the region.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): validates and imports three
  // buffer views, runs the dispatch above once, and returns the result as a
  // buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4,
    // consistent with 4-byte f32 elements of the tensor shapes they are paired with.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; used as the expected type(...) in the asserts.
    // NOTE(review): presumably the HAL element-type code for f32 — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // Expected encoding(...) value in the asserts.
    // NOTE(review): the meaning of encoding 1 is not shown here — confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the buffer view's shape, type and encoding, then
    // import it as a tensor backed by an external stream resource of the given size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region capturing the three inputs and the output resource. It
    // issues a single dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over [%c0 for its size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execute region's timepoint to obtain the output resource %3 as
    // %5, then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Module snapshot printed after the iree-util-fold-globals pass (per the dump header above).
// It holds one stream executable with the device-side dispatch function and the
// host-side entry point @simple_matmul_tensor that launches it.
// #map is the 2-D identity map; the linalg.generic below uses it for all three
// of its operands, i.e. a plain elementwise access pattern.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side container: one public export plus a nested module with its function.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: turns the two index workload values into three
    // index results via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Takes four bindings (each annotated stream.alignment = 64):
      // %arg0..%arg2 are viewed read-only, %arg3 write-only. Net effect of the
      // body is out = (%arg0 as 32x24) x (%arg1 as 24x16) + (%arg2 as 32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // View each binding, at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a fresh 32x16 tensor; %8 is %7 filled with 0.0 and serves as the
        // matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 x %5, accumulated onto the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The outs operand %7 only provides the result
        // tensor: its block argument %arg6 is never read in the region.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): validates and imports three
  // buffer views, runs the dispatch above once, and returns the result as a
  // buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4,
    // consistent with 4-byte f32 elements of the tensor shapes they are paired with.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; used as the expected type(...) in the asserts.
    // NOTE(review): presumably the HAL element-type code for f32 — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // Expected encoding(...) value in the asserts.
    // NOTE(review): the meaning of encoding 1 is not shown here — confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the buffer view's shape, type and encoding, then
    // import it as a tensor backed by an external stream resource of the given size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region capturing the three inputs and the output resource. It
    // issues a single dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over [%c0 for its size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execute region's timepoint to obtain the output resource %3 as
    // %5, then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Module snapshot printed after the iree-util-fuse-globals pass (per the dump header above).
// It holds one stream executable with the device-side dispatch function and the
// host-side entry point @simple_matmul_tensor that launches it.
// #map is the 2-D identity map; the linalg.generic below uses it for all three
// of its operands, i.e. a plain elementwise access pattern.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side container: one public export plus a nested module with its function.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: turns the two index workload values into three
    // index results via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Takes four bindings (each annotated stream.alignment = 64):
      // %arg0..%arg2 are viewed read-only, %arg3 write-only. Net effect of the
      // body is out = (%arg0 as 32x24) x (%arg1 as 24x16) + (%arg2 as 32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // View each binding, at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a fresh 32x16 tensor; %8 is %7 filled with 0.0 and serves as the
        // matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 x %5, accumulated onto the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The outs operand %7 only provides the result
        // tensor: its block argument %arg6 is never read in the region.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): validates and imports three
  // buffer views, runs the dispatch above once, and returns the result as a
  // buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4,
    // consistent with 4-byte f32 elements of the tensor shapes they are paired with.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; used as the expected type(...) in the asserts.
    // NOTE(review): presumably the HAL element-type code for f32 — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // Expected encoding(...) value in the asserts.
    // NOTE(review): the meaning of encoding 1 is not shown here — confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the buffer view's shape, type and encoding, then
    // import it as a tensor backed by an external stream resource of the given size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region capturing the three inputs and the output resource. It
    // issues a single dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over [%c0 for its size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execute region's timepoint to obtain the output resource %3 as
    // %5, then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
// Module snapshot printed after the symbol-dce pass (per the dump header above).
// It holds one stream executable with the device-side dispatch function and the
// host-side entry point @simple_matmul_tensor that launches it.
// #map is the 2-D identity map; the linalg.generic below uses it for all three
// of its operands, i.e. a plain elementwise access pattern.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side container: one public export plus a nested module with its function.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: turns the two index workload values into three
    // index results via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Takes four bindings (each annotated stream.alignment = 64):
      // %arg0..%arg2 are viewed read-only, %arg3 write-only. Net effect of the
      // body is out = (%arg0 as 32x24) x (%arg1 as 24x16) + (%arg2 as 32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // View each binding, at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a fresh 32x16 tensor; %8 is %7 filled with 0.0 and serves as the
        // matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 x %5, accumulated onto the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The outs operand %7 only provides the result
        // tensor: its block argument %arg6 is never read in the region.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): validates and imports three
  // buffer views, runs the dispatch above once, and returns the result as a
  // buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4,
    // consistent with 4-byte f32 elements of the tensor shapes they are paired with.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; used as the expected type(...) in the asserts.
    // NOTE(review): presumably the HAL element-type code for f32 — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // Expected encoding(...) value in the asserts.
    // NOTE(review): the meaning of encoding 1 is not shown here — confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the buffer view's shape, type and encoding, then
    // import it as a tensor backed by an external stream resource of the given size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region capturing the three inputs and the output resource. It
    // issues a single dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over [%c0 for its size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execute region's timepoint to obtain the output resource %3 as
    // %5, then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
// Module snapshot printed after the canonicalize pass (per the dump header above).
// It holds one stream executable with the device-side dispatch function and the
// host-side entry point @simple_matmul_tensor that launches it.
// #map is the 2-D identity map; the linalg.generic below uses it for all three
// of its operands, i.e. a plain elementwise access pattern.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side container: one public export plus a nested module with its function.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: turns the two index workload values into three
    // index results via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Takes four bindings (each annotated stream.alignment = 64):
      // %arg0..%arg2 are viewed read-only, %arg3 write-only. Net effect of the
      // body is out = (%arg0 as 32x24) x (%arg1 as 24x16) + (%arg2 as 32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // View each binding, at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a fresh 32x16 tensor; %8 is %7 filled with 0.0 and serves as the
        // matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 x %5, accumulated onto the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The outs operand %7 only provides the result
        // tensor: its block argument %arg6 is never read in the region.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): validates and imports three
  // buffer views, runs the dispatch above once, and returns the result as a
  // buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4,
    // consistent with 4-byte f32 elements of the tensor shapes they are paired with.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; used as the expected type(...) in the asserts.
    // NOTE(review): presumably the HAL element-type code for f32 — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // Expected encoding(...) value in the asserts.
    // NOTE(review): the meaning of encoding 1 is not shown here — confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the buffer view's shape, type and encoding, then
    // import it as a tensor backed by an external stream resource of the given size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region capturing the three inputs and the output resource. It
    // issues a single dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over [%c0 for its size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execute region's timepoint to obtain the output resource %3 as
    // %5, then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
// Module snapshot printed after the cse pass (per the dump header above).
// It holds one stream executable with the device-side dispatch function and the
// host-side entry point @simple_matmul_tensor that launches it.
// #map is the 2-D identity map; the linalg.generic below uses it for all three
// of its operands, i.e. a plain elementwise access pattern.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Device-side container: one public export plus a nested module with its function.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: turns the two index workload values into three
    // index results via flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body. Takes four bindings (each annotated stream.alignment = 64):
      // %arg0..%arg2 are viewed read-only, %arg3 write-only. Net effect of the
      // body is out = (%arg0 as 32x24) x (%arg1 as 24x16) + (%arg2 as 32x16).
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // View each binding, at offset %c0, as a statically shaped dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load every input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 is a fresh 32x16 tensor; %8 is %7 filled with 0.0 and serves as the
        // matmul accumulator.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // %9 = %4 x %5, accumulated onto the zero-filled %8.
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise %9 + %6. The outs operand %7 only provides the result
        // tensor: its block argument %arg6 is never read in the region.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 result through the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (tagged iree.abi.stub): validates and imports three
  // buffer views, runs the dispatch above once, and returns the result as a
  // buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4,
    // consistent with 4-byte f32 elements of the tensor shapes they are paired with.
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; used as the expected type(...) in the asserts.
    // NOTE(review): presumably the HAL element-type code for f32 — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // Expected encoding(...) value in the asserts.
    // NOTE(review): the meaning of encoding 1 is not shown here — confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the buffer view's shape, type and encoding, then
    // import it as a tensor backed by an external stream resource of the given size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Command region capturing the three inputs and the output resource. It
    // issues a single dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over [%c0 for its size].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the execute region's timepoint to obtain the output resource %3 as
    // %5, then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass (iree-util-simplify-global-accesses) ('func.func' operation: @simple_matmul_tensor) //----- //
// Identity map over two dims; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable holding a single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: receives two index workload values and returns
    // the (x, y, z) triple produced by flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes four bindings, each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // Bindings 0-2 are viewed at offset 0 as read-only tensors (32x24, 24x16,
        // 32x16); binding 3 is viewed as the write-only 32x16 result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load each input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the
        // linalg.generic further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-filled accumulator (%8), then the 32x24 by 24x16 matmul into it (%9).
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add over two parallel dims: matmul result (%9) plus the third
        // input (%6). The output block argument %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 sum into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): three buffer views in, one out.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the f32 tensors' sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) operand of each assert below; 553648160 = 0x21000020
    // (presumably the HAL element-type code for f32 -- TODO confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    // Passed as the encoding(...) operand of each assert below.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape/type/encoding on the buffer
    // view, then import it as a tensor held in an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three imports and the allocation as
    // %arg3..%arg6 and yields a timepoint. The single dispatch uses workload
    // [%c32, %c16]; the imports are bound `ro` and the allocation `wo`, each over
    // its whole range starting at offset %c0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the timepoint to obtain the allocation (%3) as %5, then export it as
    // the returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass (iree-util-apply-patterns) ('builtin.module' operation) //----- //
// Identity map over two dims; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable holding a single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: receives two index workload values and returns
    // the (x, y, z) triple produced by flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes four bindings, each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // Bindings 0-2 are viewed at offset 0 as read-only tensors (32x24, 24x16,
        // 32x16); binding 3 is viewed as the write-only 32x16 result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load each input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the
        // linalg.generic further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-filled accumulator (%8), then the 32x24 by 24x16 matmul into it (%9).
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add over two parallel dims: matmul result (%9) plus the third
        // input (%6). The output block argument %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 sum into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): three buffer views in, one out.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the f32 tensors' sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) operand of each assert below; 553648160 = 0x21000020
    // (presumably the HAL element-type code for f32 -- TODO confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    // Passed as the encoding(...) operand of each assert below.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape/type/encoding on the buffer
    // view, then import it as a tensor held in an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three imports and the allocation as
    // %arg3..%arg6 and yields a timepoint. The single dispatch uses workload
    // [%c32, %c16]; the imports are bound `ro` and the allocation `wo`, each over
    // its whole range starting at offset %c0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the timepoint to obtain the allocation (%3) as %5, then export it as
    // the returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass (iree-util-fold-globals) ('builtin.module' operation) //----- //
// Identity map over two dims; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable holding a single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: receives two index workload values and returns
    // the (x, y, z) triple produced by flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes four bindings, each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // Bindings 0-2 are viewed at offset 0 as read-only tensors (32x24, 24x16,
        // 32x16); binding 3 is viewed as the write-only 32x16 result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load each input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the
        // linalg.generic further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-filled accumulator (%8), then the 32x24 by 24x16 matmul into it (%9).
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add over two parallel dims: matmul result (%9) plus the third
        // input (%6). The output block argument %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 sum into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): three buffer views in, one out.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the f32 tensors' sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) operand of each assert below; 553648160 = 0x21000020
    // (presumably the HAL element-type code for f32 -- TODO confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    // Passed as the encoding(...) operand of each assert below.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape/type/encoding on the buffer
    // view, then import it as a tensor held in an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three imports and the allocation as
    // %arg3..%arg6 and yields a timepoint. The single dispatch uses workload
    // [%c32, %c16]; the imports are bound `ro` and the allocation `wo`, each over
    // its whole range starting at offset %c0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the timepoint to obtain the allocation (%3) as %5, then export it as
    // the returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass (iree-util-fuse-globals) ('builtin.module' operation) //----- //
// Identity map over two dims; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Executable holding a single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: receives two index workload values and returns
    // the (x, y, z) triple produced by flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes four bindings, each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // Bindings 0-2 are viewed at offset 0 as read-only tensors (32x24, 24x16,
        // 32x16); binding 3 is viewed as the write-only 32x16 result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load each input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the
        // linalg.generic further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-filled accumulator (%8), then the 32x24 by 24x16 matmul into it (%9).
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add over two parallel dims: matmul result (%9) plus the third
        // input (%6). The output block argument %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 sum into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): three buffer views in, one out.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the f32 tensors' sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) operand of each assert below; 553648160 = 0x21000020
    // (presumably the HAL element-type code for f32 -- TODO confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    // Passed as the encoding(...) operand of each assert below.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape/type/encoding on the buffer
    // view, then import it as a tensor held in an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three imports and the allocation as
    // %arg3..%arg6 and yields a timepoint. The single dispatch uses workload
    // [%c32, %c16]; the imports are bound `ro` and the allocation `wo`, each over
    // its whole range starting at offset %c0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the timepoint to obtain the allocation (%3) as %5, then export it as
    // the returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass (iree-hal-assign-target-devices) ('builtin.module' operation) //----- //
// Device target alias: "llvm-cpu" with one executable target,
// "embedded-elf-x86_64" (native_vector_size = 16, empty cpu_features).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Identity map over two dims; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// The module carries the device target list as its hal.device.targets attribute.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding a single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: receives two index workload values and returns
    // the (x, y, z) triple produced by flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes four bindings, each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // Bindings 0-2 are viewed at offset 0 as read-only tensors (32x24, 24x16,
        // 32x16); binding 3 is viewed as the write-only 32x16 result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load each input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the
        // linalg.generic further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-filled accumulator (%8), then the 32x24 by 24x16 matmul into it (%9).
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add over two parallel dims: matmul result (%9) plus the third
        // input (%6). The output block argument %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 sum into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): three buffer views in, one out.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the f32 tensors' sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) operand of each assert below; 553648160 = 0x21000020
    // (presumably the HAL element-type code for f32 -- TODO confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    // Passed as the encoding(...) operand of each assert below.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape/type/encoding on the buffer
    // view, then import it as a tensor held in an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three imports and the allocation as
    // %arg3..%arg6 and yields a timepoint. The single dispatch uses workload
    // [%c32, %c16]; the imports are bound `ro` and the allocation `wo`, each over
    // its whole range starting at offset %c0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the timepoint to obtain the allocation (%3) as %5, then export it as
    // the returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass (iree-hal-verify-target-environment) ('builtin.module' operation) //----- //
// Device target alias: "llvm-cpu" with one executable target,
// "embedded-elf-x86_64" (native_vector_size = 16, empty cpu_features).
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Identity map over two dims; used for every operand of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
// The module carries the device target list as its hal.device.targets attribute.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding a single exported dispatch entry point and its body.
  stream.executable private @simple_matmul_tensor_dispatch_0 {
    // Workgroup-count region: receives two index workload values and returns
    // the (x, y, z) triple produced by flow.dispatch.default_workgroup_count.
    stream.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg0, %arg1
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: takes four bindings, each annotated with stream.alignment = 64.
      func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: !stream.binding {stream.alignment = 64 : index}) {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        // Bindings 0-2 are viewed at offset 0 as read-only tensors (32x24, 24x16,
        // 32x16); binding 3 is viewed as the write-only 32x16 result.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x24xf32>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:24x16xf32>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:32x16xf32>
        %3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:32x16xf32>
        // Load each input in full (zero offsets, full sizes, unit strides).
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
        // %7 serves both as the fill destination and as the outs operand of the
        // linalg.generic further down.
        %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
        // Zero-filled accumulator (%8), then the 32x24 by 24x16 matmul into it (%9).
        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
        %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
        // Elementwise add over two parallel dims: matmul result (%9) plus the third
        // input (%6). The output block argument %arg6 is not read by the body.
        %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %11 = arith.addf %arg4, %arg5 : f32
          linalg.yield %11 : f32
        } -> tensor<32x16xf32>
        // Store the full 32x16 sum into the write-only binding.
        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
        return
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): three buffer views in, one out.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4, i.e.
    // consistent with the f32 tensors' sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) operand of each assert below; 553648160 = 0x21000020
    // (presumably the HAL element-type code for f32 -- TODO confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    // Passed as the encoding(...) operand of each assert below.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert the expected shape/type/encoding on the buffer
    // view, then import it as a tensor held in an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized external resource of size %c2048 that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three imports and the allocation as
    // %arg3..%arg6 and yields a timepoint. The single dispatch uses workload
    // [%c32, %c16]; the imports are bound `ro` and the allocation `wo`, each over
    // its whole range starting at offset %c0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      }
    } => !stream.timepoint
    // Await the timepoint to obtain the allocation (%3) as %5, then export it as
    // the returned buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) ('builtin.module' operation) //----- //
// Snapshot contents: one HAL executable (@simple_matmul_tensor_dispatch_0) with a
// single llvm-cpu / embedded-elf-x86_64 variant, plus the host-side entry point
// @simple_matmul_tensor that dispatches it through the stream dialect.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Layout referenced by the export op below: no push constants and one set (0)
// holding four storage_buffer bindings (0..3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// 2-D identity map; used for all three operands of the linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes the device and two index arguments and returns the
      // x/y/z workgroup counts. At this stage the counts are still produced by
      // flow.dispatch.default_workgroup_count rather than explicit arithmetic.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        // Dispatch body: computes matmul(binding0, binding1) + binding2 and
        // stores the 32x16 result to binding3. Still untiled: every load and
        // the store cover the full tensors.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0..2 are read-only inputs (32x24, 24x16, 32x16); binding 3
          // is the write-only 32x16 output. All use byte offset %c0.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
          %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
          %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
          // %7 serves two roles: destination of the zero fill that seeds the
          // matmul accumulator (%8), and outs operand of the linalg.generic.
          %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
          %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
          %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
          // Elementwise add of the matmul result (%9) and the binding-2 tensor
          // (%6). The outs element (%arg2) is not read inside the body.
          %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
          ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
            %11 = arith.addf %arg0, %arg1 : f32
            linalg.yield %11 : f32
          } -> tensor<32x16xf32>
          flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
          return
        }
      }
    }
  }
  // Host entry point: validates and imports three buffer views, runs the
  // dispatch, and returns the result as a new buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes in bytes for the f32 tensors: 32*24*4, 24*16*4, 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 == 0x21000020, presumably the HAL element-type
    // code for f32, and 1 presumably the dense row-major encoding - confirm
    // against the HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, element type and
    // encoding, then imported as an external stream resource of known size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that the dispatch writes into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // The execute region captures the three imports and the output resource.
    // The single dispatch uses workload [%c32, %c16], reads %arg3..%arg5 and
    // writes %arg6, each over its full byte range.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      // hal.interface.bindings lists four <set, binding> pairs, <0, 0>..<0, 3>;
      // presumably one per resource above in operand order - TODO confirm.
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execute timepoint, then export the output resource as the
    // returned 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After VerifyLinalgTransformLegality (iree-llvmcpu-verify-linalg-transform-legality) ('builtin.module' operation) //----- //
// Snapshot contents: the IR text here appears unchanged from the preceding
// dump (after iree-hal-materialize-interfaces): one HAL executable with an
// untiled matmul + elementwise-add dispatch, and the host entry point.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup counts are still computed by the
      // flow.dispatch.default_workgroup_count op from the two index arguments.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        // Dispatch body: result = matmul(binding0, binding1) + binding2,
        // written to binding3. The linalg ops operate on whole tensors.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
          %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
          %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
          // Zero-filled accumulator (%8) feeds the matmul; the generic then
          // adds the matmul result to %6 element by element.
          %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
          %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
          %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
          %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
          ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
            %11 = arith.addf %arg0, %arg1 : f32
            linalg.yield %11 : f32
          } -> tensor<32x16xf32>
          flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
          return
        }
      }
    }
  }
  // Host entry point: assert + import the three buffer views, dispatch once,
  // await completion, export the output resource.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output resource (2048 bytes), written by the dispatch as %arg6.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Snapshot contents: although the dump header names the dispatch func.func,
// the whole enclosing module is printed. The IR text appears unchanged from
// the preceding dump; all tensors in the dispatch are still f32.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        // Dispatch body: loads three full input tensors, computes
        // matmul(%4, %5) + %6, and stores the 32x16 result to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
          %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
          %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
          %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
          %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
          %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
          // Elementwise f32 add of the two ins operands; the outs element
          // (%arg2) is unused in the body.
          %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
          ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
            %11 = arith.addf %arg0, %arg1 : f32
            linalg.yield %11 : f32
          } -> tensor<32x16xf32>
          flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
          return
        }
      }
    }
  }
  // Host entry point: three asserted imports, one dispatch with workload
  // [%c32, %c16], then await and export of the 2048-byte output resource.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) ('builtin.module' operation) //----- //
// Snapshot contents: the IR text appears unchanged from the preceding dump.
// The dispatch still operates on tensors (flow.dispatch.tensor.load/store and
// linalg ops on tensor types); no memref types appear in it.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // The export carries no translation_info attribute yet, and the workgroup
      // count is still produced by flow.dispatch.default_workgroup_count.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        // Dispatch body: fill + matmul + elementwise add on full-size tensors,
        // with the result stored to the write-only binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<32x24xf32>
          %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
          %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<32x16xf32>
          %7 = linalg.init_tensor [32, 16] : tensor<32x16xf32>
          %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<32x16xf32>) -> tensor<32x16xf32>
          %9 = linalg.matmul ins(%4, %5 : tensor<32x24xf32>, tensor<24x16xf32>) outs(%8 : tensor<32x16xf32>) -> tensor<32x16xf32>
          %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<32x16xf32>, tensor<32x16xf32>) outs(%7 : tensor<32x16xf32>) {
          ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
            %11 = arith.addf %arg0, %arg1 : f32
            linalg.yield %11 : f32
          } -> tensor<32x16xf32>
          flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : tensor<32x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
          return
        }
      }
    }
  }
  // Host entry point: validates the three inputs, imports them as external
  // stream resources, dispatches the executable once, and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) ('hal.executable.variant' operation: @embedded_elf_x86_64) //----- //
// Snapshot contents: the dispatch is now tiled into 16x16 output tiles that
// are distributed across workgroups with scf.for loops, the export region
// computes the workgroup count explicitly, and the matmul/export carry
// codegen attributes (#config / #translation). The host function text is the
// same as in the preceding dump.
//
// Lowering config attached to the matmul: three tile-size lists,
// [16, 16, 0], [8, 32, 0] and [0, 0, 12].
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: workload -> number of 16-wide tiles (used in the export region).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: workgroup id/count -> element offset/step (multiples of 16).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map for the elementwise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the export: CPUDoubleTilingPadExpert with
// workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup count: x = ceildiv(%arg2, 16), y = ceildiv(%arg1, 16), z = 1.
      // Note the swap: the first returned (x) count comes from %arg2.
      // NOTE(review): with the host workload [%c32, %c16] this would give
      // x = 1, y = 2, assuming %arg1/%arg2 receive the workload in order -
      // confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Tiled dispatch body: each loop iteration computes one 16x16 tile of
        // matmul(binding0, binding1) + binding2 and stores it to binding3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Row loop: starts at workgroup_id_y * 16, steps by
          // workgroup_count_y * 16, upper bound 32 (the output row count).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          scf.for %arg0 = %4 to %c32 step %5 {
            // Column loop: starts at workgroup_id_x * 16, steps by
            // workgroup_count_x * 16, upper bound 16 (the output column count).
            %6 = affine.apply #map1()[%workgroup_id_x]
            %7 = affine.apply #map1()[%workgroup_count_x]
            scf.for %arg1 = %6 to %c16 step %7 {
              // 16x16 tile of the addend (binding 2) at [%arg0, %arg1].
              %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // %9 is used only as the outs operand of the linalg.generic.
              %9 = linalg.init_tensor [16, 16] : tensor<16x16xf32>
              // 16x24 row slice of binding 0 and 24x16 column slice of binding 1;
              // the reduction dimension (24) is loaded in full.
              %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Separate init_tensor (%12), zero-filled to seed the matmul
              // accumulator.
              %12 = linalg.init_tensor [16, 16] : tensor<16x16xf32>
              %13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
              %14 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%13 : tensor<16x16xf32>) -> tensor<16x16xf32>
              // Elementwise add of the matmul tile and the addend tile; the
              // outs element (%arg4) is not read in the body.
              %15 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%14, %8 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%9 : tensor<16x16xf32>) {
              ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
                %16 = arith.addf %arg2, %arg3 : f32
                linalg.yield %16 : f32
              } -> tensor<16x16xf32>
              // Write the finished tile back at the same [%arg0, %arg1] offsets.
              flow.dispatch.tensor.store %15, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host entry point: asserts and imports the three buffer views, issues one
  // dispatch with workload [%c32, %c16], awaits it, and exports the output.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes in bytes for the f32 tensors: 32*24*4, 24*16*4, 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte output resource, written by the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable @simple_matmul_tensor_dispatch_0 with a single variant (@embedded_elf_x86_64).
  // The variant holds one exported entry point and the builtin.module that defines it.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes a device and two index arguments and returns the triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
      // NOTE(review): presumably the returned triple is the workgroup count (x, y, z) and
      // %arg1/%arg2 receive the [%c32, %c16] workload of the stream.cmd.dispatch in
      // @simple_matmul_tensor, which would give (1, 2, 1) - confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. Per 16x16 output tile it computes
        //   out_tile = (0 + lhs_rows * rhs_cols) + bias_tile
        // where lhs/rhs/bias come from bindings 0/1/2 and the result goes to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Row loop over [0, 32): starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16 (#map1 is `s0 * 16`).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          scf.for %arg0 = %4 to %c32 step %5 {
            // Column loop over [0, 16): starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16.
            %6 = affine.apply #map1()[%workgroup_id_x]
            %7 = affine.apply #map1()[%workgroup_count_x]
            scf.for %arg1 = %6 to %c16 step %7 {
              // %8: 16x16 tile taken from the write-only output binding; it is only used as the
              // destination (outs) of the zero fill below.
              %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %9: matching 16x16 tile of binding 2, added to the matmul result by the linalg.generic.
              %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16 rows x all 24 columns of binding 0; %11: all 24 rows x 16 columns of binding 1.
              %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // %12 has no uses in this function: the fill below writes into %8 instead.
              %12 = linalg.init_tensor [16, 16] : tensor<16x16xf32>
              // Zero the accumulator tile, then accumulate %10 x %11 into it.
              %13 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
              %14 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%13 : tensor<16x16xf32>) -> tensor<16x16xf32>
              // Elementwise add of the binding-2 tile (%arg2) onto the matmul result (%arg3), reusing %14 as the destination.
              %15 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%14 : tensor<16x16xf32>) {
              ^bb0(%arg2: f32, %arg3: f32):
                %16 = arith.addf %arg3, %arg2 : f32
                linalg.yield %16 : f32
              } -> tensor<16x16xf32>
              // Write the finished tile back to the output binding at the same [%arg0, %arg1] offsets.
              flow.dispatch.tensor.store %15, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the single dispatch
  // into a freshly allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (element count times 4 - presumably bytes per f32; confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; passed as type(...) to every assert below.
    // NOTE(review): presumably the HAL element-type code for f32, and %c1_i32 the expected
    // encoding code - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized resource that the dispatch writes (bound as `wo` below).
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execution region capturing the three inputs and the result resource; it yields a timepoint.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [%c32, %c16]: three read-only ranges and one write-only
      // range, each spanning its whole resource and mapped to set 0, bindings 0-3 in order.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint; %5 is the result resource %3 once that timepoint is reached.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable @simple_matmul_tensor_dispatch_0 with a single variant (@embedded_elf_x86_64).
  // The variant holds one exported entry point and the builtin.module that defines it.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes a device and two index arguments and returns the triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
      // NOTE(review): presumably the returned triple is the workgroup count (x, y, z) and
      // %arg1/%arg2 receive the [%c32, %c16] workload of the stream.cmd.dispatch in
      // @simple_matmul_tensor, which would give (1, 2, 1) - confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. Per 16x16 output tile it computes
        //   out_tile = (0 + lhs_rows * rhs_cols) + bias_tile
        // where lhs/rhs/bias come from bindings 0/1/2 and the result goes to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Row loop over [0, 32): starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16 (#map1 is `s0 * 16`).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          scf.for %arg0 = %4 to %c32 step %5 {
            // Column loop over [0, 16): starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16.
            %6 = affine.apply #map1()[%workgroup_id_x]
            %7 = affine.apply #map1()[%workgroup_count_x]
            scf.for %arg1 = %6 to %c16 step %7 {
              // %8: 16x16 tile taken from the write-only output binding; it is only used as the
              // destination (outs) of the zero fill below.
              %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %9: matching 16x16 tile of binding 2, added to the matmul result by the linalg.generic.
              %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16 rows x all 24 columns of binding 0; %11: all 24 rows x 16 columns of binding 1.
              %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Zero the accumulator tile, then accumulate %10 x %11 into it.
              %12 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
              %13 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
              // Elementwise add of the binding-2 tile (%arg2) onto the matmul result (%arg3), reusing %13 as the destination.
              %14 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%13 : tensor<16x16xf32>) {
              ^bb0(%arg2: f32, %arg3: f32):
                %15 = arith.addf %arg3, %arg2 : f32
                linalg.yield %15 : f32
              } -> tensor<16x16xf32>
              // Write the finished tile back to the output binding at the same [%arg0, %arg1] offsets.
              flow.dispatch.tensor.store %14, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the single dispatch
  // into a freshly allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (element count times 4 - presumably bytes per f32; confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; passed as type(...) to every assert below.
    // NOTE(review): presumably the HAL element-type code for f32, and %c1_i32 the expected
    // encoding code - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized resource that the dispatch writes (bound as `wo` below).
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execution region capturing the three inputs and the result resource; it yields a timepoint.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [%c32, %c16]: three read-only ranges and one write-only
      // range, each spanning its whole resource and mapped to set 0, bindings 0-3 in order.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint; %5 is the result resource %3 once that timepoint is reached.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable @simple_matmul_tensor_dispatch_0 with a single variant (@embedded_elf_x86_64).
  // The variant holds one exported entry point and the builtin.module that defines it.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes a device and two index arguments and returns the triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
      // NOTE(review): presumably the returned triple is the workgroup count (x, y, z) and
      // %arg1/%arg2 receive the [%c32, %c16] workload of the stream.cmd.dispatch in
      // @simple_matmul_tensor, which would give (1, 2, 1) - confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. Per 16x16 output tile it computes
        //   out_tile = (0 + lhs_rows * rhs_cols) + bias_tile
        // where lhs/rhs/bias come from bindings 0/1/2 and the result goes to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Row loop over [0, 32): starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16 (#map1 is `s0 * 16`).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          scf.for %arg0 = %4 to %c32 step %5 {
            // Column loop over [0, 16): starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16.
            %6 = affine.apply #map1()[%workgroup_id_x]
            %7 = affine.apply #map1()[%workgroup_count_x]
            scf.for %arg1 = %6 to %c16 step %7 {
              // %8: 16x16 tile taken from the write-only output binding; it is only used as the
              // destination (outs) of the zero fill below.
              %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %9: matching 16x16 tile of binding 2, added to the matmul result by the linalg.generic.
              %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16 rows x all 24 columns of binding 0; %11: all 24 rows x 16 columns of binding 1.
              %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Zero the accumulator tile, then accumulate %10 x %11 into it.
              %12 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
              %13 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
              // Elementwise add of the binding-2 tile (%arg2) onto the matmul result (%arg3), reusing %13 as the destination.
              %14 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%13 : tensor<16x16xf32>) {
              ^bb0(%arg2: f32, %arg3: f32):
                %15 = arith.addf %arg3, %arg2 : f32
                linalg.yield %15 : f32
              } -> tensor<16x16xf32>
              // Write the finished tile back to the output binding at the same [%arg0, %arg1] offsets.
              flow.dispatch.tensor.store %14, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the single dispatch
  // into a freshly allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (element count times 4 - presumably bytes per f32; confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; passed as type(...) to every assert below.
    // NOTE(review): presumably the HAL element-type code for f32, and %c1_i32 the expected
    // encoding code - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized resource that the dispatch writes (bound as `wo` below).
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execution region capturing the three inputs and the result resource; it yields a timepoint.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [%c32, %c16]: three read-only ranges and one write-only
      // range, each spanning its whole resource and mapped to set 0, bindings 0-3 in order.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint; %5 is the result resource %3 once that timepoint is reached.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('builtin.module' operation) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable @simple_matmul_tensor_dispatch_0 with a single variant (@embedded_elf_x86_64).
  // The variant holds one exported entry point and the builtin.module that defines it.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes a device and two index arguments and returns the triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
      // NOTE(review): presumably the returned triple is the workgroup count (x, y, z) and
      // %arg1/%arg2 receive the [%c32, %c16] workload of the stream.cmd.dispatch in
      // @simple_matmul_tensor, which would give (1, 2, 1) - confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. Per 16x16 output tile it computes
        //   out_tile = (0 + lhs_rows * rhs_cols) + bias_tile
        // where lhs/rhs/bias come from bindings 0/1/2 and the result goes to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16 f32); binding 3 is the write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Row loop over [0, 32): starts at workgroup_id_y * 16 and steps by workgroup_count_y * 16 (#map1 is `s0 * 16`).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          scf.for %arg0 = %4 to %c32 step %5 {
            // Column loop over [0, 16): starts at workgroup_id_x * 16 and steps by workgroup_count_x * 16.
            %6 = affine.apply #map1()[%workgroup_id_x]
            %7 = affine.apply #map1()[%workgroup_count_x]
            scf.for %arg1 = %6 to %c16 step %7 {
              // %8: 16x16 tile taken from the write-only output binding; it is only used as the
              // destination (outs) of the zero fill below.
              %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %9: matching 16x16 tile of binding 2, added to the matmul result by the linalg.generic.
              %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16 rows x all 24 columns of binding 0; %11: all 24 rows x 16 columns of binding 1.
              %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Zero the accumulator tile, then accumulate %10 x %11 into it.
              %12 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16x16xf32>) -> tensor<16x16xf32>
              %13 = linalg.matmul {lowering_config = #config} ins(%10, %11 : tensor<16x24xf32>, tensor<24x16xf32>) outs(%12 : tensor<16x16xf32>) -> tensor<16x16xf32>
              // Elementwise add of the binding-2 tile (%arg2) onto the matmul result (%arg3), reusing %13 as the destination.
              %14 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<16x16xf32>) outs(%13 : tensor<16x16xf32>) {
              ^bb0(%arg2: f32, %arg3: f32):
                %15 = arith.addf %arg3, %arg2 : f32
                linalg.yield %15 : f32
              } -> tensor<16x16xf32>
              // Write the finished tile back to the output binding at the same [%arg0, %arg1] offsets.
              flow.dispatch.tensor.store %14, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the single dispatch
  // into a freshly allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (element count times 4 - presumably bytes per f32; confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; passed as type(...) to every assert below.
    // NOTE(review): presumably the HAL element-type code for f32, and %c1_i32 the expected
    // encoding code - confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each input is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized resource that the dispatch writes (bound as `wo` below).
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execution region capturing the three inputs and the result resource; it yields a timepoint.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [%c32, %c16]: three read-only ranges and one write-only
      // range, each spanning its whole resource and mapped to set 0, bindings 0-3 in order.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint; %5 is the result resource %3 once that timepoint is reached.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyTileAndFusePass (linalg-strategy-tile-and-fuse-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module below.
// #config is attached to the linalg.matmul in the dispatch function and carries three tile-size lists.
// The first list (16, 16) matches the 16x16 tiles loaded per workgroup iteration; the second list
// (8, 32) matches the steps of the two inner scf.for tile loops. No loop with step 12 (third list)
// appears in this dump.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target "llvm-cpu" listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide a symbol by 16 (used in the export region to derive workgroup counts).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply a symbol by 16 (turns workgroup ids/counts into loop offsets/steps).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: under affine.min this evaluates to min(16 - d0, 32).
#map2 = affine_map<(d0) -> (-d0 + 16, 32)>
// #map3: under affine.min this evaluates to min(16 - d0, d1).
#map3 = affine_map<(d0, d1) -> (-d0 + 16, d1)>
// #map4: 2-D identity map, used as the indexing map for both operands of the linalg.generic.
#map4 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the hal.executable.export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      // Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch function: tiled matmul of a 32x24 by a 24x16 tensor followed by an elementwise
        // add of a 32x16 tensor, written to the 32x16 write-only binding.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 tensor.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Outer loop over rows: starts at workgroup_id_y * 16, steps by workgroup_count_y * 16, bound 32.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          scf.for %arg0 = %4 to %c32 step %5 {
            // Column loop: starts at workgroup_id_x * 16, steps by workgroup_count_x * 16, bound 16.
            %6 = affine.apply #map1()[%workgroup_id_x]
            %7 = affine.apply #map1()[%workgroup_count_x]
            scf.for %arg1 = %6 to %c16 step %7 {
              // %8: 16x16 tile loaded from the write-only binding 3; it seeds the iter_args of the tile loops.
              %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %9: 16x16 tile of binding 2 (the addend of the linalg.generic below).
              %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // %10 / %11: 16x24 row tile of binding 0 and 24x16 column tile of binding 1 (matmul inputs).
              %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Tile loop over rows of the 16x16 tile in steps of 8.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %8) -> (tensor<16x16xf32>) {
                // Tile loop over columns: lower bound 0, upper bound 16, step 32, so the body executes once.
                %13 = scf.for %arg4 = %c0 to %c16 step %c32 iter_args(%arg5 = %arg3) -> (tensor<16x16xf32>) {
                  // %14 and %15 are the same expression, min(16 - %arg4, 32): the dynamic column extent of the slices.
                  %14 = affine.min #map2(%arg4)
                  %15 = affine.min #map2(%arg4)
                  %16 = tensor.extract_slice %9[%arg2, %arg4] [8, %14] [1, 1] : tensor<16x16xf32> to tensor<8x?xf32>
                  %17 = tensor.extract_slice %arg5[%arg2, %arg4] [8, %15] [1, 1] : tensor<16x16xf32> to tensor<8x?xf32>
                  // The output slice is filled with 0.0 before being used as the matmul accumulator.
                  %18 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%17 : tensor<8x?xf32>) -> tensor<8x?xf32>
                  // %19 = min(16 - %arg4, %15): column extent of the right-hand-side slice.
                  %19 = affine.min #map3(%arg4, %15)
                  %20 = tensor.extract_slice %10[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                  %21 = tensor.extract_slice %11[0, %arg4] [24, %19] [1, 1] : tensor<24x16xf32> to tensor<24x?xf32>
                  %22 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%20, %21 : tensor<8x24xf32>, tensor<24x?xf32>) outs(%18 : tensor<8x?xf32>) -> tensor<8x?xf32>
                  // Elementwise add of the binding-2 slice (%16) onto the matmul result (%22).
                  %23 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x?xf32>) outs(%22 : tensor<8x?xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                  ^bb0(%arg6: f32, %arg7: f32):
                    %25 = arith.addf %arg7, %arg6 : f32
                    linalg.yield %25 : f32
                  } -> tensor<8x?xf32>
                  // Write the finished slice back into the loop-carried 16x16 tile.
                  %24 = tensor.insert_slice %23 into %arg5[%arg2, %arg4] [8, %15] [1, 1] : tensor<8x?xf32> into tensor<16x16xf32>
                  scf.yield %24 : tensor<16x16xf32>
                }
                scf.yield %13 : tensor<16x16xf32>
              }
              // Store the completed 16x16 tile to binding 3 at the same offsets it was loaded from.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above,
  // and exports the result as a 32x16xf32 buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors - confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed as type(...) and encoding(...) to the buffer view assertions below.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result resource; it is bound as the write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [32, 16]; the three imported resources are bound ro, the result wo.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module below.
// #config is attached to the linalg.matmul in the dispatch function and carries three tile-size lists.
// The 16x16 tiles loaded per workgroup iteration match the first list and the step-8 row loop matches
// the first entry of the second list; no loop with step 32 or 12 appears in this dump.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target "llvm-cpu" listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide a symbol by 16 (used in the export region to derive workgroup counts).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply a symbol by 16 (turns workgroup ids/counts into loop offsets/steps).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map, used as the indexing map for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the hal.executable.export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      // Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch function: tiled matmul of a 32x24 by a 24x16 tensor followed by an elementwise
        // add of a 32x16 tensor, written to the 32x16 write-only binding. All slice shapes are static here.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 tensor.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Loop offsets and steps for both distributed loops are computed up front:
          // %4/%5 = id_y*16 / count_y*16 (rows), %6/%7 = id_x*16 / count_x*16 (columns).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // %8: 16x24 row tile of binding 0; it depends only on %arg0 and is loaded outside the column loop.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile loaded from the write-only binding 3; it seeds the iter_args of the tile loop.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 tile of binding 2 (addend); %11: 24x16 column tile of binding 1 (matmul RHS).
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Single tile loop over rows of the 16x16 tile in steps of 8; each iteration handles an 8x16 slice.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // The output slice is filled with 0.0 before being used as the matmul accumulator.
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 slice of the LHS tile times the whole 24x16 RHS tile.
                %16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise add of the binding-2 slice (%17) onto the matmul result (%16).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the finished slice back into the loop-carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the completed 16x16 tile to binding 3 at the same offsets it was loaded from.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above,
  // and exports the result as a 32x16xf32 buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors - confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed as type(...) and encoding(...) to the buffer view assertions below.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result resource; it is bound as the write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [32, 16]; the three imported resources are bound ro, the result wo.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// NOTE(review): the IR in this dump appears to match the preceding "After CSE" dump op-for-op - confirm with a diff.
// Attribute aliases referenced by the module below.
// #config is attached to the linalg.matmul in the dispatch function and carries three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target "llvm-cpu" listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide a symbol by 16 (used in the export region to derive workgroup counts).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply a symbol by 16 (turns workgroup ids/counts into loop offsets/steps).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map, used as the indexing map for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the hal.executable.export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      // Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch function: tiled matmul of a 32x24 by a 24x16 tensor followed by an elementwise
        // add of a 32x16 tensor, written to the 32x16 write-only binding.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 tensor.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // %4/%5 = id_y*16 / count_y*16 (row loop start/step); %6/%7 = id_x*16 / count_x*16 (column loop start/step).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // %8: 16x24 row tile of binding 0 (matmul LHS), loaded once per row iteration.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile loaded from the write-only binding 3; it seeds the iter_args of the tile loop.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 tile of binding 2 (addend); %11: 24x16 column tile of binding 1 (matmul RHS).
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Tile loop over rows of the 16x16 tile in steps of 8; each iteration handles an 8x16 slice.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // fill, matmul and generic below still carry the __internal_linalg_transform__ = "1" attribute in this dump.
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 slice of the LHS tile times the whole 24x16 RHS tile, accumulating into the zero-filled slice.
                %16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise add of the binding-2 slice (%17) onto the matmul result (%16).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the finished slice back into the loop-carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the completed 16x16 tile to binding 3 at the same offsets it was loaded from.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above,
  // and exports the result as a 32x16xf32 buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors - confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed as type(...) and encoding(...) to the buffer view assertions below.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result resource; it is bound as the write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [32, 16]; the three imported resources are bound ro, the result wo.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module below.
// #config is attached to the linalg.matmul in the dispatch function and carries three tile-size lists.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// Device target "llvm-cpu" listing a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the hal.executable.variant below.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide a symbol by 16 (used in the export region to derive workgroup counts).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply a symbol by 16 (turns workgroup ids/counts into loop offsets/steps).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map, used as the indexing map for both operands of the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the hal.executable.export op.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      // Export region: returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch function: tiled matmul of a 32x24 by a 24x16 tensor followed by an elementwise
        // add of a 32x16 tensor, written to the 32x16 write-only binding.
        // In this dump the linalg ops carry no __internal_linalg_transform__ attribute; the matmul
        // keeps only lowering_config.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 tensor.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // %4/%5 = id_y*16 / count_y*16 (row loop start/step); %6/%7 = id_x*16 / count_x*16 (column loop start/step).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // %8: 16x24 row tile of binding 0 (matmul LHS), loaded once per row iteration.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile loaded from the write-only binding 3; it seeds the iter_args of the tile loop.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 tile of binding 2 (addend); %11: 24x16 column tile of binding 1 (matmul RHS).
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Tile loop over rows of the 16x16 tile in steps of 8; each iteration handles an 8x16 slice.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // The output slice is filled with 0.0 before being used as the matmul accumulator.
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 slice of the LHS tile times the whole 24x16 RHS tile.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise add of the binding-2 slice (%17) onto the matmul result (%16).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the finished slice back into the loop-carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the completed 16x16 tile to binding 3 at the same offsets it was loaded from.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above,
  // and exports the result as a 32x16xf32 buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (presumably byte sizes of the f32 tensors - confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed as type(...) and encoding(...) to the buffer view assertions below.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result resource; it is bound as the write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [32, 16]; the three imported resources are bound ro, the result wo.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config: lowering configuration attached to the linalg.matmul op; it holds
// three tile-size lists (presumably one per tiling level - confirm against the
// IREE codegen documentation).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: llvm-cpu device target listing one
// embedded-elf-x86_64 executable target; used as the module's hal.device.targets.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and a single set (0) with four
// storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target that is
// listed inside #device_target_llvm_cpu; used as the variant's target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16; applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16; applied to workgroup ids and counts in the dispatch body.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic op.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with
// workload_per_wg = [16, 16]; attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Private executable that contains the single dispatch entry point
  // @simple_matmul_tensor_dispatch_0_matmul_32x16x24.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    // Variant for the target aliased as #executable_target_embedded_elf_x86_64_.
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export of entry point ordinal 0. Its region receives the device plus
      // two index values and returns three index values.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is s0 ceildiv 16: each index argument is divided by 16, rounding up.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order matters: the value derived from %arg2 is returned first, then
        // the one derived from %arg1, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. It takes no arguments; all data is reached through
        // the four interface bindings of set 0.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3
          // is the write-only 32x16 f32 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          // Workgroup id and count along dimension 0 (x) and dimension 1 (y).
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is s0 * 16: the scaled ids become the loop lower bounds
          // (%4, %6) and the scaled counts become the loop steps (%5, %7).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop: starts at workgroup_id_y * 16 with upper bound 32.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 at row offset %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop: starts at workgroup_id_x * 16 with upper bound 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // 16x16 tile loaded from the result binding; it is only used as
              // the initial loop-carried value of the loop below.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching 16x16 tile of binding 2 and the 24x16 block of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in two steps of 8 rows, carrying the tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slice of the carried tile before it is
                // used as the matmul output operand.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 slice of the binding-0 block times the 24x16 block of binding 1.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the binding-2 slice (%arg4) onto the
                // matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Put the 8x16 result back into the carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to the result binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub). Takes three buffer views,
  // runs the matmul dispatch on them and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. element count times 4 for the f32 tensors.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed to the type(...) and encoding(...) clauses of the asserts
    // (presumably the HAL codes for f32 and the expected encoding - confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is first checked against its expected shape, type and
    // encoding and then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region capturing the three imported resources and the result
    // resource as %arg3 through %arg6.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three inputs are bound
      // read-only (ro) over their full ranges and the result write-only (wo);
      // the bindings attribute lists set 0, bindings 0 through 3.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Await the timepoint of the execute region to obtain the result resource,
    // then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config: lowering configuration attached to the linalg.matmul op; it holds
// three tile-size lists (presumably one per tiling level - confirm against the
// IREE codegen documentation).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: llvm-cpu device target listing one
// embedded-elf-x86_64 executable target; used as the module's hal.device.targets.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and a single set (0) with four
// storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target that is
// listed inside #device_target_llvm_cpu; used as the variant's target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16; applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16; applied to workgroup ids and counts in the dispatch body.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic op.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with
// workload_per_wg = [16, 16]; attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Private executable that contains the single dispatch entry point
  // @simple_matmul_tensor_dispatch_0_matmul_32x16x24.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    // Variant for the target aliased as #executable_target_embedded_elf_x86_64_.
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export of entry point ordinal 0. Its region receives the device plus
      // two index values and returns three index values.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is s0 ceildiv 16: each index argument is divided by 16, rounding up.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order matters: the value derived from %arg2 is returned first, then
        // the one derived from %arg1, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. It takes no arguments; all data is reached through
        // the four interface bindings of set 0.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3
          // is the write-only 32x16 f32 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          // Workgroup id and count along dimension 0 (x) and dimension 1 (y).
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is s0 * 16: the scaled ids become the loop lower bounds
          // (%4, %6) and the scaled counts become the loop steps (%5, %7).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop: starts at workgroup_id_y * 16 with upper bound 32.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 at row offset %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop: starts at workgroup_id_x * 16 with upper bound 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // 16x16 tile loaded from the result binding; it is only used as
              // the initial loop-carried value of the loop below.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching 16x16 tile of binding 2 and the 24x16 block of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in two steps of 8 rows, carrying the tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slice of the carried tile before it is
                // used as the matmul output operand.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 slice of the binding-0 block times the 24x16 block of binding 1.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the binding-2 slice (%arg4) onto the
                // matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Put the 8x16 result back into the carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to the result binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub). Takes three buffer views,
  // runs the matmul dispatch on them and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. element count times 4 for the f32 tensors.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed to the type(...) and encoding(...) clauses of the asserts
    // (presumably the HAL codes for f32 and the expected encoding - confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is first checked against its expected shape, type and
    // encoding and then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region capturing the three imported resources and the result
    // resource as %arg3 through %arg6.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three inputs are bound
      // read-only (ro) over their full ranges and the result write-only (wo);
      // the bindings attribute lists set 0, bindings 0 through 3.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Await the timepoint of the execute region to obtain the result resource,
    // then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config: lowering configuration attached to the linalg.matmul op; it holds
// three tile-size lists (presumably one per tiling level - confirm against the
// IREE codegen documentation).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: llvm-cpu device target listing one
// embedded-elf-x86_64 executable target; used as the module's hal.device.targets.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and a single set (0) with four
// storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target that is
// listed inside #device_target_llvm_cpu; used as the variant's target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16; applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16; applied to workgroup ids and counts in the dispatch body.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic op.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with
// workload_per_wg = [16, 16]; attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Private executable that contains the single dispatch entry point
  // @simple_matmul_tensor_dispatch_0_matmul_32x16x24.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    // Variant for the target aliased as #executable_target_embedded_elf_x86_64_.
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export of entry point ordinal 0. Its region receives the device plus
      // two index values and returns three index values.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is s0 ceildiv 16: each index argument is divided by 16, rounding up.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order matters: the value derived from %arg2 is returned first, then
        // the one derived from %arg1, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. It takes no arguments; all data is reached through
        // the four interface bindings of set 0.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3
          // is the write-only 32x16 f32 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          // Workgroup id and count along dimension 0 (x) and dimension 1 (y).
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is s0 * 16: the scaled ids become the loop lower bounds
          // (%4, %6) and the scaled counts become the loop steps (%5, %7).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop: starts at workgroup_id_y * 16 with upper bound 32.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 at row offset %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop: starts at workgroup_id_x * 16 with upper bound 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // 16x16 tile loaded from the result binding; it is only used as
              // the initial loop-carried value of the loop below.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching 16x16 tile of binding 2 and the 24x16 block of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in two steps of 8 rows, carrying the tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slice of the carried tile before it is
                // used as the matmul output operand.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 slice of the binding-0 block times the 24x16 block of binding 1.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the binding-2 slice (%arg4) onto the
                // matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Put the 8x16 result back into the carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to the result binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub). Takes three buffer views,
  // runs the matmul dispatch on them and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. element count times 4 for the f32 tensors.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed to the type(...) and encoding(...) clauses of the asserts
    // (presumably the HAL codes for f32 and the expected encoding - confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is first checked against its expected shape, type and
    // encoding and then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region capturing the three imported resources and the result
    // resource as %arg3 through %arg6.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three inputs are bound
      // read-only (ro) over their full ranges and the result write-only (wo);
      // the bindings attribute lists set 0, bindings 0 through 3.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Await the timepoint of the execute region to obtain the result resource,
    // then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases referenced by the module that follows.
// #config: lowering configuration attached to the linalg.matmul op; it holds
// three tile-size lists (presumably one per tiling level - confirm against the
// IREE codegen documentation).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: llvm-cpu device target listing one
// embedded-elf-x86_64 executable target; used as the module's hal.device.targets.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and a single set (0) with four
// storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target that is
// listed inside #device_target_llvm_cpu; used as the variant's target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16; applied in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16; applied to workgroup ids and counts in the dispatch body.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map; used for both operands of the linalg.generic op.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with
// workload_per_wg = [16, 16]; attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Private executable that contains the single dispatch entry point
  // @simple_matmul_tensor_dispatch_0_matmul_32x16x24.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    // Variant for the target aliased as #executable_target_embedded_elf_x86_64_.
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export of entry point ordinal 0. Its region receives the device plus
      // two index values and returns three index values.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is s0 ceildiv 16: each index argument is divided by 16, rounding up.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order matters: the value derived from %arg2 is returned first, then
        // the one derived from %arg1, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. It takes no arguments; all data is reached through
        // the four interface bindings of set 0.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16 and 32x16 f32); binding 3
          // is the write-only 32x16 f32 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          // Workgroup id and count along dimension 0 (x) and dimension 1 (y).
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is s0 * 16: the scaled ids become the loop lower bounds
          // (%4, %6) and the scaled counts become the loop steps (%5, %7).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop: starts at workgroup_id_y * 16 with upper bound 32.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 at row offset %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop: starts at workgroup_id_x * 16 with upper bound 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // 16x16 tile loaded from the result binding; it is only used as
              // the initial loop-carried value of the loop below.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching 16x16 tile of binding 2 and the 24x16 block of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in two steps of 8 rows, carrying the tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slice of the carried tile before it is
                // used as the matmul output operand. In this dump the fill op
                // also carries an __internal_linalg_transform__ attribute
                // (presumably a marker left by the pad strategy pass - confirm).
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 slice of the binding-0 block times the 24x16 block of binding 1.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the binding-2 slice (%arg4) onto the
                // matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Put the 8x16 result back into the carried 16x16 tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to the result binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub). Takes three buffer views,
  // runs the matmul dispatch on them and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes used below: 3072 = 32*24*4, 1536 = 24*16*4 and
    // 2048 = 32*16*4, i.e. element count times 4 for the f32 tensors.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Values passed to the type(...) and encoding(...) clauses of the asserts
    // (presumably the HAL codes for f32 and the expected encoding - confirm).
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is first checked against its expected shape, type and
    // encoding and then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-sized external resource that receives the result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region capturing the three imported resources and the result
    // resource as %arg3 through %arg6.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three inputs are bound
      // read-only (ro) over their full ranges and the result write-only (wo);
      // the bindings attribute lists set 0, bindings 0 through 3.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Await the timepoint of the execute region to obtain the result resource,
    // then export it as a 32x16 f32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases for the module that follows.
// #config: lowering configuration holding three tile-size lists (presumably
// one per tiling level - confirm against the IREE codegen documentation).
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: llvm-cpu device target listing one
// embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: zero push constants and a single set (0) with four
// storage_buffer bindings numbered 0 through 3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: the same executable target that is
// listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: s0 ceildiv 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: s0 * 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with
// workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module for the llvm-cpu device target: one HAL executable plus the host-side entry function.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding the single dispatch entry point generated for the matmul.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: computes the 3-D workgroup count from the two workload
      // indices. Each is divided by 16 rounding up (#map0); the count derived
      // from %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: a 32x24 by 24x16 matmul into a zero-filled accumulator,
        // followed by an element-wise add of the 32x16 tensor from binding 2,
        // written to binding 3. Work is split into 16x16 workgroup tiles and
        // further into 8-row sub-tiles.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale workgroup ids and counts by 16 (#map1) to get the starting
          // offsets (%4, %6) and steps (%5, %7) of the two distribution loops.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of the left-hand operand (binding 0).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the write-only output binding, used as the initial iter_arg of the inner loop.
              // %10: matching 16x16 tile of binding 2 (added after the matmul).
              // %11: 24x16 slice of the right-hand operand (binding 1) at column offset %arg1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop: walks the 16-row tile in steps of 8 rows, threading the 16x16 result tile through iter_args.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero-fill the 8x16 output sub-tile; the fill still carries the __internal_linalg_transform__ marker attribute in this dump.
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 by 24x16 matmul accumulating into the zero-filled sub-tile.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Element-wise add: %arg5 is the matmul result (outs), %arg4 the matching slice of the binding-2 tile (ins).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the 16x16 tile carried by the loop.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (iree.abi.stub): checks and imports the three buffer
  // views, allocates the result, runs the dispatch and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes for the f32 tensors: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) and encoding(...) operands of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 / dense row-major; confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch output.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three inputs and the output resource and
    // issues one dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over its full range.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export the output resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul op (lowering_config = #config).
// It carries three tile-size lists. NOTE(review): presumably one list per tiling level; confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// HAL device target "llvm-cpu" with a single embedded-elf-x86_64 executable target and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one descriptor set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the executable variant; its fields match the entry listed in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: applied in the export region to turn workload sizes into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to scale workgroup ids/counts into loop offsets/steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map used for both operands of the element-wise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the executable export: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module for the llvm-cpu device target: one HAL executable plus the host-side entry function.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding the single dispatch entry point generated for the matmul.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: computes the 3-D workgroup count from the two workload
      // indices. Each is divided by 16 rounding up (#map0); the count derived
      // from %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: a 32x24 by 24x16 matmul into a zero-filled accumulator,
        // followed by an element-wise add of the 32x16 tensor from binding 2,
        // written to binding 3. Work is split into 16x16 workgroup tiles and
        // further into 8-row sub-tiles.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale workgroup ids and counts by 16 (#map1) to get the starting
          // offsets (%4, %6) and steps (%5, %7) of the two distribution loops.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of the left-hand operand (binding 0).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the write-only output binding, used as the initial iter_arg of the inner loop.
              // %10: matching 16x16 tile of binding 2 (added after the matmul).
              // %11: 24x16 slice of the right-hand operand (binding 1) at column offset %arg1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop: walks the 16-row tile in steps of 8 rows, threading the 16x16 result tile through iter_args.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero-fill the 8x16 output sub-tile; the fill still carries the __internal_linalg_transform__ marker attribute in this dump.
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 by 24x16 matmul accumulating into the zero-filled sub-tile.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Element-wise add: %arg5 is the matmul result (outs), %arg4 the matching slice of the binding-2 tile (ins).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the 16x16 tile carried by the loop.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (iree.abi.stub): checks and imports the three buffer
  // views, allocates the result, runs the dispatch and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes for the f32 tensors: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) and encoding(...) operands of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 / dense row-major; confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch output.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three inputs and the output resource and
    // issues one dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over its full range.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export the output resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul op (lowering_config = #config).
// It carries three tile-size lists. NOTE(review): presumably one list per tiling level; confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// HAL device target "llvm-cpu" with a single embedded-elf-x86_64 executable target and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one descriptor set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the executable variant; its fields match the entry listed in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: applied in the export region to turn workload sizes into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to scale workgroup ids/counts into loop offsets/steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map used for both operands of the element-wise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the executable export: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module for the llvm-cpu device target: one HAL executable plus the host-side entry function.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding the single dispatch entry point generated for the matmul.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: computes the 3-D workgroup count from the two workload
      // indices. Each is divided by 16 rounding up (#map0); the count derived
      // from %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: a 32x24 by 24x16 matmul into a zero-filled accumulator,
        // followed by an element-wise add of the 32x16 tensor from binding 2,
        // written to binding 3. Work is split into 16x16 workgroup tiles and
        // further into 8-row sub-tiles.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale workgroup ids and counts by 16 (#map1) to get the starting
          // offsets (%4, %6) and steps (%5, %7) of the two distribution loops.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of the left-hand operand (binding 0).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the write-only output binding, used as the initial iter_arg of the inner loop.
              // %10: matching 16x16 tile of binding 2 (added after the matmul).
              // %11: 24x16 slice of the right-hand operand (binding 1) at column offset %arg1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop: walks the 16-row tile in steps of 8 rows, threading the 16x16 result tile through iter_args.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero-fill the 8x16 output sub-tile (the fill carries no transform marker attribute in this dump).
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 by 24x16 matmul accumulating into the zero-filled sub-tile.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Element-wise add: %arg5 is the matmul result (outs), %arg4 the matching slice of the binding-2 tile (ins).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the 16x16 tile carried by the loop.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (iree.abi.stub): checks and imports the three buffer
  // views, allocates the result, runs the dispatch and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes for the f32 tensors: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) and encoding(...) operands of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 / dense row-major; confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch output.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three inputs and the output resource and
    // issues one dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over its full range.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export the output resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Lowering configuration referenced by the linalg.matmul op (lowering_config = #config).
// It carries three tile-size lists. NOTE(review): presumably one list per tiling level; confirm against the IREE codegen docs.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// HAL device target "llvm-cpu" with a single embedded-elf-x86_64 executable target and the legacy_sync flag.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one descriptor set (0) with four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Executable target used by the executable variant; its fields match the entry listed in #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// s0 ceildiv 16: applied in the export region to turn workload sizes into workgroup counts.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// s0 * 16: applied in the dispatch function to scale workgroup ids/counts into loop offsets/steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// 2-D identity map used for both operands of the element-wise linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Translation info attached to the executable export: names CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Top-level module for the llvm-cpu device target: one HAL executable plus the host-side entry function.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding the single dispatch entry point generated for the matmul.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: computes the 3-D workgroup count from the two workload
      // indices. Each is divided by 16 rounding up (#map0); the count derived
      // from %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: a 32x24 by 24x16 matmul into a zero-filled accumulator,
        // followed by an element-wise add of the 32x16 tensor from binding 2,
        // written to binding 3. Work is split into 16x16 workgroup tiles and
        // further into 8-row sub-tiles.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale workgroup ids and counts by 16 (#map1) to get the starting
          // offsets (%4, %6) and steps (%5, %7) of the two distribution loops.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of the left-hand operand (binding 0).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the write-only output binding, used as the initial iter_arg of the inner loop.
              // %10: matching 16x16 tile of binding 2 (added after the matmul).
              // %11: 24x16 slice of the right-hand operand (binding 1) at column offset %arg1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop: walks the 16-row tile in steps of 8 rows, threading the 16x16 result tile through iter_args.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero-fill the 8x16 output sub-tile (the fill carries no transform marker attribute in this dump).
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 by 24x16 matmul accumulating into the zero-filled sub-tile.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Element-wise add: %arg5 is the matmul result (outs), %arg4 the matching slice of the binding-2 tile (ins).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the 16x16 tile carried by the loop.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (iree.abi.stub): checks and imports the three buffer
  // views, allocates the result, runs the dispatch and exports the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes for the f32 tensors: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Passed as the type(...) and encoding(...) operands of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 / dense row-major; confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch output.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region: captures the three inputs and the output resource and
    // issues one dispatch with workload [%c32, %c16]; the inputs are bound
    // read-only (ro) and the output write-only (wo), each over its full range.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export the output resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases for this dump section.
// #config: lowering configuration holding three lists of tile sizes.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; set 0 exposes four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: standalone copy of the executable target embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide the symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply the symbol by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Module for this dump: one HAL executable holding the tiled matmul kernel, plus the
// host-side entry point @simple_matmul_tensor that dispatches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with a single variant targeting #executable_target_embedded_elf_x86_64_.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: maps the two workload indices to the returned triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
      // NOTE(review): presumably the workgroup counts along x, y, z -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Kernel: for each 16x16 output tile, computes (binding 0 rows * binding 1 columns)
        // plus the matching binding 2 tile in two 8-row steps, then stores it to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 scales workgroup ids/counts by 16: %4/%6 are loop start offsets, %5/%7 are loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop: from workgroup_id_y * 16 up to 32, stepping by workgroup_count_y * 16.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop: from workgroup_id_x * 16 up to 16, stepping by workgroup_count_x * 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (a tile of the write-only output) only seeds iter_args below; every 8x16 slice
              // taken from it is zero-filled before being used.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop over the 16 tile rows in steps of 8, carrying the 16x16 tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-initialised 8x16 accumulator slice.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 * 24x16 matmul into the zeroed slice; the op carries the
                // __internal_linalg_transform__ = "1" marker attribute and #config.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the matching binding 2 slice (%arg4) onto the matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the carried tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile at [%arg0, %arg1] of binding 3.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host entry point: checks and imports the three buffer views, runs the dispatch,
  // and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Element-type and encoding codes checked by the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Check each argument's shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Dispatch with workload [32, 16]: %arg3-%arg5 are read-only, %arg6 is write-only,
    // each passed over its full byte range and mapped to bindings 0-3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the timepoint, then export the result resource as a tensor<32x16xf32> buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases for this dump section.
// #config: lowering configuration holding three lists of tile sizes.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; set 0 exposes four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: standalone copy of the executable target embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide the symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply the symbol by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Module for this dump: one HAL executable holding the tiled matmul kernel, plus the
// host-side entry point @simple_matmul_tensor that dispatches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with a single variant targeting #executable_target_embedded_elf_x86_64_.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: maps the two workload indices to the returned triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
      // NOTE(review): presumably the workgroup counts along x, y, z -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Kernel: for each 16x16 output tile, computes (binding 0 rows * binding 1 columns)
        // plus the matching binding 2 tile in two 8-row steps, then stores it to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 scales workgroup ids/counts by 16: %4/%6 are loop start offsets, %5/%7 are loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop: from workgroup_id_y * 16 up to 32, stepping by workgroup_count_y * 16.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop: from workgroup_id_x * 16 up to 16, stepping by workgroup_count_x * 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (a tile of the write-only output) only seeds iter_args below; every 8x16 slice
              // taken from it is zero-filled before being used.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop over the 16 tile rows in steps of 8, carrying the 16x16 tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-initialised 8x16 accumulator slice.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 * 24x16 matmul into the zeroed slice; the op carries the
                // __internal_linalg_transform__ = "1" marker attribute and #config.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the matching binding 2 slice (%arg4) onto the matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the carried tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile at [%arg0, %arg1] of binding 3.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host entry point: checks and imports the three buffer views, runs the dispatch,
  // and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Element-type and encoding codes checked by the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Check each argument's shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Dispatch with workload [32, 16]: %arg3-%arg5 are read-only, %arg6 is write-only,
    // each passed over its full byte range and mapped to bindings 0-3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the timepoint, then export the result resource as a tensor<32x16xf32> buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases for this dump section.
// #config: lowering configuration holding three lists of tile sizes.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; set 0 exposes four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: standalone copy of the executable target embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide the symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply the symbol by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Module for this dump: one HAL executable holding the tiled matmul kernel, plus the
// host-side entry point @simple_matmul_tensor that dispatches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with a single variant targeting #executable_target_embedded_elf_x86_64_.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: maps the two workload indices to the returned triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
      // NOTE(review): presumably the workgroup counts along x, y, z -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Kernel: for each 16x16 output tile, computes (binding 0 rows * binding 1 columns)
        // plus the matching binding 2 tile in two 8-row steps, then stores it to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 scales workgroup ids/counts by 16: %4/%6 are loop start offsets, %5/%7 are loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop: from workgroup_id_y * 16 up to 32, stepping by workgroup_count_y * 16.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop: from workgroup_id_x * 16 up to 16, stepping by workgroup_count_x * 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (a tile of the write-only output) only seeds iter_args below; every 8x16 slice
              // taken from it is zero-filled before being used.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop over the 16 tile rows in steps of 8, carrying the 16x16 tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-initialised 8x16 accumulator slice.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 * 24x16 matmul into the zeroed slice; the op carries the
                // __internal_linalg_transform__ = "1" marker attribute and #config.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the matching binding 2 slice (%arg4) onto the matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the carried tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile at [%arg0, %arg1] of binding 3.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host entry point: checks and imports the three buffer views, runs the dispatch,
  // and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Element-type and encoding codes checked by the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Check each argument's shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Dispatch with workload [32, 16]: %arg3-%arg5 are read-only, %arg6 is write-only,
    // each passed over its full byte range and mapped to bindings 0-3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the timepoint, then export the result resource as a tensor<32x16xf32> buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases for this dump section.
// #config: lowering configuration holding three lists of tile sizes.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; set 0 exposes four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: standalone copy of the executable target embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide the symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply the symbol by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
// Module for this dump: one HAL executable holding the tiled matmul kernel, plus the
// host-side entry point @simple_matmul_tensor that dispatches it.
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with a single variant targeting #executable_target_embedded_elf_x86_64_.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: maps the two workload indices to the returned triple
      // (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1) via #map0.
      // NOTE(review): presumably the workgroup counts along x, y, z -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Kernel: for each 16x16 output tile, computes (binding 0 rows * binding 1 columns)
        // plus the matching binding 2 tile in two 8-row steps, then stores it to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 scales workgroup ids/counts by 16: %4/%6 are loop start offsets, %5/%7 are loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop: from workgroup_id_y * 16 up to 32, stepping by workgroup_count_y * 16.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row block of binding 0 for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop: from workgroup_id_x * 16 up to 16, stepping by workgroup_count_x * 16.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (a tile of the write-only output) only seeds iter_args below; every 8x16 slice
              // taken from it is zero-filled before being used.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Inner loop over the 16 tile rows in steps of 8, carrying the 16x16 tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-initialised 8x16 accumulator slice.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // 8x24 * 24x16 matmul into the zeroed slice; in this dump the op's only
                // attribute is lowering_config = #config.
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Element-wise add of the matching binding 2 slice (%arg4) onto the matmul result (%arg5).
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                // Write the 8x16 result back into the carried tile.
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile at [%arg0, %arg1] of binding 3.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host entry point: checks and imports the three buffer views, runs the dispatch,
  // and returns the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Byte sizes of the f32 tensors: 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Element-type and encoding codes checked by the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Check each argument's shape/type/encoding, then import it as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Dispatch with workload [32, 16]: %arg3-%arg5 are read-only, %arg6 is write-only,
    // each passed over its full byte range and mapped to bindings 0-3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the timepoint, then export the result resource as a tensor<32x16xf32> buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases for this dump section.
// #config: lowering configuration holding three lists of tile sizes.
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
// #device_target_llvm_cpu: "llvm-cpu" device target listing one embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// #executable_layout: no push constants; set 0 exposes four storage_buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// #executable_target_embedded_elf_x86_64_: standalone copy of the executable target embedded in the device target.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide the symbol by 16.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply the symbol by 16.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2: 2-D identity map.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// #translation: translation info naming CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The attached region
      // computes the three values it returns from its two index arguments: each
      // is divided by 16 rounding up (#map0 = s0 ceildiv 16) and the third
      // value is the constant 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The value derived from %arg2 is returned first, then the one from %arg1.
        // NOTE(review): presumably (%arg1, %arg2) receive the [%c32, %c16] workload
        // of the host-side stream.cmd.dispatch, giving (1, 2, 1) - confirm.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side body of the matmul dispatch (state after linalg-fuse).
        // It walks 16x16 tiles of the 32x16 result; loop starts are
        // workgroup id * 16 and loop steps are workgroup count * 16 (both via
        // #map1 = s0 * 16). Each tile is processed as two 8-row slices:
        // zero-fill, matmul, then an elementwise add of the matching slice read
        // from binding 2.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // %4/%5: row start and row step (from the y id/count);
          // %6/%7: column start and column step (from the x id/count).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of binding 0 (left matmul operand).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // NOTE(review): %9 is loaded from the writeonly binding. It only seeds
              // the iter_arg below; every 8-row slice of it is zero-filled before
              // being used as the matmul output operand.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Two iterations (row offsets 0 and 8) carrying the 16x16 result tile.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 times 24x16 into the zero-filled 8x16 slice.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise: matmul result (%arg5) + slice of binding 2 (%arg4).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to binding 3 at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (carries iree.abi.stub). It asserts the shape,
  // element type and encoding of the three incoming buffer views, imports them
  // as external stream resources, runs the single matmul dispatch into a newly
  // allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes; they equal the tensor sizes with 4-byte f32 elements:
    // 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020.
    // NOTE(review): presumably the HAL element-type code for f32 - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // NOTE(review): encoding value 1 is presumably dense row-major - confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: allocated uninitialized and passed as the only
    // write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [%c32, %c16]; the four resources are attached
    // to set 0, bindings 0-3 in order.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The attached region
      // computes the three values it returns from its two index arguments: each
      // is divided by 16 rounding up (#map0 = s0 ceildiv 16) and the third
      // value is the constant 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The value derived from %arg2 is returned first, then the one from %arg1.
        // NOTE(review): presumably (%arg1, %arg2) receive the [%c32, %c16] workload
        // of the host-side stream.cmd.dispatch, giving (1, 2, 1) - confirm.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side body of the matmul dispatch (state after
        // linalg-strategy-pad-pass). It walks 16x16 tiles of the 32x16 result;
        // loop starts are workgroup id * 16 and loop steps are workgroup
        // count * 16 (both via #map1 = s0 * 16). Each tile is processed as two
        // 8-row slices: zero-fill, matmul, then an elementwise add of the
        // matching slice read from binding 2.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // %4/%5: row start and row step (from the y id/count);
          // %6/%7: column start and column step (from the x id/count).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of binding 0 (left matmul operand).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // NOTE(review): %9 is loaded from the writeonly binding. It only seeds
              // the iter_arg below; every 8-row slice of it is zero-filled before
              // being used as the matmul output operand.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Two iterations (row offsets 0 and 8) carrying the 16x16 result tile.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 times 24x16 into the zero-filled 8x16 slice.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise: matmul result (%arg5) + slice of binding 2 (%arg4).
                // In this dump the op carries the __internal_linalg_transform__
                // marker attribute.
                // NOTE(review): presumably bookkeeping for the linalg-strategy
                // passes - confirm.
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to binding 3 at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (carries iree.abi.stub). It asserts the shape,
  // element type and encoding of the three incoming buffer views, imports them
  // as external stream resources, runs the single matmul dispatch into a newly
  // allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes; they equal the tensor sizes with 4-byte f32 elements:
    // 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020.
    // NOTE(review): presumably the HAL element-type code for f32 - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // NOTE(review): encoding value 1 is presumably dense row-major - confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: allocated uninitialized and passed as the only
    // write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [%c32, %c16]; the four resources are attached
    // to set 0, bindings 0-3 in order.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The attached region
      // computes the three values it returns from its two index arguments: each
      // is divided by 16 rounding up (#map0 = s0 ceildiv 16) and the third
      // value is the constant 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The value derived from %arg2 is returned first, then the one from %arg1.
        // NOTE(review): presumably (%arg1, %arg2) receive the [%c32, %c16] workload
        // of the host-side stream.cmd.dispatch, giving (1, 2, 1) - confirm.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side body of the matmul dispatch (state after cse). It walks
        // 16x16 tiles of the 32x16 result; loop starts are workgroup id * 16
        // and loop steps are workgroup count * 16 (both via #map1 = s0 * 16).
        // Each tile is processed as two 8-row slices: zero-fill, matmul, then
        // an elementwise add of the matching slice read from binding 2.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // %4/%5: row start and row step (from the y id/count);
          // %6/%7: column start and column step (from the x id/count).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of binding 0 (left matmul operand).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // NOTE(review): %9 is loaded from the writeonly binding. It only seeds
              // the iter_arg below; every 8-row slice of it is zero-filled before
              // being used as the matmul output operand.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Two iterations (row offsets 0 and 8) carrying the 16x16 result tile.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 times 24x16 into the zero-filled 8x16 slice.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise: matmul result (%arg5) + slice of binding 2 (%arg4).
                // In this dump the op carries the __internal_linalg_transform__
                // marker attribute.
                // NOTE(review): presumably bookkeeping for the linalg-strategy
                // passes - confirm.
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to binding 3 at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (carries iree.abi.stub). It asserts the shape,
  // element type and encoding of the three incoming buffer views, imports them
  // as external stream resources, runs the single matmul dispatch into a newly
  // allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes; they equal the tensor sizes with 4-byte f32 elements:
    // 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020.
    // NOTE(review): presumably the HAL element-type code for f32 - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // NOTE(review): encoding value 1 is presumably dense row-major - confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: allocated uninitialized and passed as the only
    // write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [%c32, %c16]; the four resources are attached
    // to set 0, bindings 0-3 in order.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The attached region
      // computes the three values it returns from its two index arguments: each
      // is divided by 16 rounding up (#map0 = s0 ceildiv 16) and the third
      // value is the constant 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The value derived from %arg2 is returned first, then the one from %arg1.
        // NOTE(review): presumably (%arg1, %arg2) receive the [%c32, %c16] workload
        // of the host-side stream.cmd.dispatch, giving (1, 2, 1) - confirm.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side body of the matmul dispatch (state after
        // linalg-strategy-enable-pass). It walks 16x16 tiles of the 32x16
        // result; loop starts are workgroup id * 16 and loop steps are
        // workgroup count * 16 (both via #map1 = s0 * 16). Each tile is
        // processed as two 8-row slices: zero-fill, matmul, then an elementwise
        // add of the matching slice read from binding 2.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // %4/%5: row start and row step (from the y id/count);
          // %6/%7: column start and column step (from the x id/count).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row tile of binding 0 (left matmul operand).
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // NOTE(review): %9 is loaded from the writeonly binding. It only seeds
              // the iter_arg below; every 8-row slice of it is zero-filled before
              // being used as the matmul output operand.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Two iterations (row offsets 0 and 8) carrying the 16x16 result tile.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 times 24x16 into the zero-filled 8x16 slice.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Elementwise: matmul result (%arg5) + slice of binding 2 (%arg4).
                // In this dump the op carries the __internal_linalg_transform__
                // marker attribute.
                // NOTE(review): presumably bookkeeping for the linalg-strategy
                // passes - confirm.
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile to binding 3 at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (carries iree.abi.stub). It asserts the shape,
  // element type and encoding of the three incoming buffer views, imports them
  // as external stream resources, runs the single matmul dispatch into a newly
  // allocated result resource, and exports that resource as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes; they equal the tensor sizes with 4-byte f32 elements:
    // 32*24*4 = 3072, 24*16*4 = 1536, 32*16*4 = 2048.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020.
    // NOTE(review): presumably the HAL element-type code for f32 - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    // NOTE(review): encoding value 1 is presumably dense row-major - confirm.
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: allocated uninitialized and passed as the only
    // write-only (wo) operand of the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [%c32, %c16]; the four resources are attached
    // to set 0, bindings 0-3 in order.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry for the dispatch function, with a region that computes the
      // values handed to hal.return from the two index workload arguments.
      // #map0 (declared at the top of this module) is `s0 ceildiv 16`, so the
      // region returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      // NOTE(review): presumably these are the x/y/z workgroup counts and
      // %arg1/%arg2 are the [%c32, %c16] workload of the stream.cmd.dispatch
      // in @simple_matmul_tensor below -- confirm against the HAL dialect docs.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Note the operand order: the value derived from %arg2 is returned first.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side dispatch body as printed after LinalgStrategyRemoveMarkersPass.
        // Inputs: binding 0 (32x24 f32), binding 1 (24x16 f32), binding 2 (32x16 f32),
        // all readonly. Output: binding 3 (32x16 f32), writeonly.
        // Every 8x16 output slice is produced as: zero fill -> linalg.matmul
        // accumulation -> element-wise add of the matching binding-2 slice.
        // In this dump no linalg op carries an __internal_linalg_transform__
        // attribute; the matmul only has its lowering_config.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4/%6 are the first row/column offsets for this
          // workgroup id, %5/%7 are the loop steps (workgroup count * 16).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows; %8 is the 16x24 LHS tile.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 is loaded from the writeonly output binding and is only used as
              // the destination tensor of the loop below; both of its 8x16 halves
              // are overwritten (fill + insert_slice) before the tile is stored.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walks the 16-row tile in steps of 8 rows (offsets 0 and 8), threading
              // the 16x16 result tile through iter_args.
              // NOTE(review): the step of 8 matches the first entry of the second
              // tile_sizes level [8, 32, 0] in #config -- presumably its origin.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero-initialise the accumulator slice.
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // 8x24 times 24x16 accumulated into the zero-filled 8x16 slice.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Element-wise: matmul result (%arg5) + binding-2 slice (%arg4).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile back to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side ABI stub: checks the three incoming buffer views, imports them as
  // stream resources, runs the single dispatch, and exports the result.
  // Takes three !hal.buffer_view arguments and returns one !hal.buffer_view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes; each equals element count * 4 for the f32 tensor it is
    // used with (32*24*4, 24*16*4, 32*16*4), i.e. presumably sizes in bytes.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): values passed to the assert's type(...) and encoding(...)
    // operands; presumably the HAL codes for f32 / dense row-major -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape before being imported.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: allocated uninitialized and bound write-only (wo) below.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [%c32, %c16]; the three imports are bound
    // read-only (ro) and the fresh allocation write-only, each over its full range.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Await the execute timepoint before exporting the output resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry for the dispatch function. Its region maps the two index
      // workload arguments through #map0 (`s0 ceildiv 16`, declared at the top
      // of this module) and returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      // NOTE(review): presumably the x/y/z workgroup counts -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The %arg2-derived value comes first in the returned triple.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side dispatch body as printed after LinalgFuse. The ops here are
        // the same as in the dump printed just before this one in the file
        // (after LinalgStrategyRemoveMarkersPass).
        // Reads binding 0 (32x24 f32), binding 1 (24x16 f32) and binding 2
        // (32x16 f32); writes binding 3 (32x16 f32). Per 8x16 slice it performs
        // zero fill -> matmul -> element-wise add of the binding-2 slice.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Loop bounds via #map1 (`s0 * 16`): start = workgroup id * 16,
          // step = workgroup count * 16, for the y (rows) and x (columns) axes.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Rows of the 32x16 output, 16 at a time.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Columns of the 32x16 output, 16 at a time.
            scf.for %arg1 = %6 to %c16 step %7 {
              // Destination tile taken from the writeonly binding; its contents are
              // fully replaced by the loop below before being stored back.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Two iterations (row offsets 0 and 8), each producing one 8x16 slice.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %15 = tensor.extract_slice %8[%arg2, 0] [8, 24] [1, 1] : tensor<16x24xf32> to tensor<8x24xf32>
                // Full-K matmul (K = 24) into the zero-filled slice; not yet tiled
                // along the reduction dimension in this dump.
                %16 = linalg.matmul {lowering_config = #config} ins(%15, %11 : tensor<8x24xf32>, tensor<24x16xf32>) outs(%14 : tensor<8x16xf32>) -> tensor<8x16xf32>
                %17 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Adds the binding-2 slice (%arg4) onto the matmul result (%arg5).
                %18 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%17 : tensor<8x16xf32>) outs(%16 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %20 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %20 : f32
                } -> tensor<8x16xf32>
                %19 = tensor.insert_slice %18 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side ABI stub around the dispatch: assert + import the three buffer
  // views, allocate the output, execute one dispatch, await it, export the result.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Sizes attached to the stream resources: 3072 = 32*24*4, 1536 = 24*16*4,
    // 2048 = 32*16*4 (presumably byte sizes of the f32 tensors).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type/encoding codes checked by hal.buffer_view.assert;
    // their meaning is not visible here -- confirm against the HAL definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Input checks followed by imports: 32x24, 24x16 and 32x16 f32 tensors.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output resource; left uninitialized because the dispatch writes it.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch over workload [%c32, %c16] with 3 read-only and 1
    // write-only resource ranges, mapped to bindings <0,0>..<0,3>.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // The output resource is only exported after the timepoint is awaited.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyTilePass (linalg-strategy-tile-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry for the dispatch function. The region applies #map0
      // (`s0 ceildiv 16`, declared at the top of this module) to both index
      // workload arguments and returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      // NOTE(review): presumably the x/y/z workgroup counts -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Returned in the order (%arg2-derived, %arg1-derived, constant 1).
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side dispatch body as printed after LinalgStrategyTilePass.
        // Differences from the dump printed just before this one (after LinalgFuse):
        //   * new constants %c24 and %c12;
        //   * the linalg.matmul now sits inside an scf.for over the reduction
        //     dimension (0..24 step 12) and works on 8x12 / 12x16 slices;
        //   * linalg.fill, linalg.matmul and linalg.generic carry
        //     __internal_linalg_transform__ = "1".
        // The computation is otherwise the same: per 8x16 output slice, zero fill,
        // accumulate the matmul, then add the matching slice of binding 2.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: start offsets from the workgroup ids, loop steps
          // from the workgroup counts.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row tiles (16 rows each) of the 32-row output.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column tiles (16 columns each) of the 16-column output.
            scf.for %arg1 = %6 to %c16 step %7 {
              // Destination tile from the writeonly binding; overwritten below.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // 8-row slices of the 16x16 tile (row offsets 0 and 8).
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction loop: two steps of 12 over K = 24, accumulating into the
                // zero-filled slice carried in %arg5.
                // NOTE(review): step 12 matches the third tile_sizes level [0, 0, 12]
                // in #config -- presumably where it comes from.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Element-wise add of the binding-2 slice onto the accumulated product.
                // (%arg4/%arg5 here are the generic's block arguments, distinct from
                // the reduction loop's values of the same name above.)
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side ABI stub: validates and imports the three !hal.buffer_view
  // arguments, launches the dispatch, and returns the exported result view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 32*24*4, 24*16*4 and 32*16*4 respectively (presumably
    // bytes, given the f32 element type).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): operands for the assert's type(...) / encoding(...)
    // clauses; exact meaning not shown in this file -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Assert the expected shape of each argument, then import it.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized output storage, written by the dispatch.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Execute region containing the one dispatch (workload [%c32, %c16]).
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the timepoint, then hand the output resource back as a buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry for the dispatch function. Using #map0 (`s0 ceildiv 16`,
      // declared at the top of this module) the region turns the two index
      // workload arguments into (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1).
      // NOTE(review): presumably the x/y/z workgroup counts -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // %1 (from %arg2) is returned ahead of %0 (from %arg1).
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side dispatch body as printed after CSE. The ops are the same as
        // in the dump printed just before this one (after LinalgStrategyTilePass):
        // CSE made no visible change to this function.
        // Structure: workgroup row/column tile loops -> 8-row slice loop ->
        // zero fill, K-reduction loop of 8x12 * 12x16 matmuls, element-wise add of
        // the binding-2 slice -> insert into the 16x16 tile -> store to binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are readonly inputs; binding 3 is the writeonly output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`; ids give loop lower bounds, counts give loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 LHS tile for the current row offset.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: destination tile (from the writeonly binding), %10: addend tile,
              // %11: 24x16 RHS tile for the current column offset.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Two 8-row slices per 16x16 tile.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction over K = 24 in two chunks of 12, accumulating in %arg5.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Adds the binding-2 slice to the accumulated product, element-wise.
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Store the completed tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side ABI stub for the dispatch. Sequence: assert and import three
  // buffer views -> allocate output -> stream.cmd.execute with one dispatch ->
  // await timepoint -> export the output as a !hal.buffer_view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // 3072 / 1536 / 2048 are the sizes attached to the 32x24, 24x16 and 32x16
    // f32 resources (each equals element count * 4; presumably bytes).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): type and encoding codes used by the asserts below; their
    // definitions are outside this file -- confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Shape/type/encoding assertions precede each import.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Output resource, uninitialized until the dispatch writes it.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Captures the four resources and dispatches with workload [%c32, %c16].
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Await completion, then export the output resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region for the dispatch entry point: each index argument
      // is divided by 16, rounding up (#map0), and the results are returned
      // together with a constant 1.
      // NOTE(review): %arg1/%arg2 are presumably the workload values supplied at
      // the dispatch site — confirm against the HAL dialect documentation.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The count derived from %arg2 is returned first, then the one from %arg1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: for every 16x16 output tile visited by this workgroup it
        // multiplies a slice of binding 0 by a slice of binding 1 into a
        // zero-filled accumulator, adds the matching tile of binding 2
        // element-wise, and stores the result into binding 3.
        // At this stage the fill, matmul and generic ops still carry the
        // `__internal_linalg_transform__ = "1"` attribute.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 multiplies by 16: the workgroup id gives the loop start offset,
          // the workgroup count gives the loop step.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32 output rows, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16 output columns, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (loaded from the write-only output binding) only seeds the
              // iter_arg below; each 8x16 slice of it is filled with %cst before use.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction loop: walk the 24-wide contraction dimension in steps of 12,
                // accumulating into the zero-filled 8x16 slice.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the binding-2 slice (%16) onto the matmul result (%15).
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) attrs =  {__internal_linalg_transform__ = "1"} {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile back to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): checks the three incoming
  // buffer views, imports them as external stream resources, runs the single
  // matmul dispatch into a newly allocated result resource, and returns that
  // result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors — TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; NOTE(review): presumably the HAL element-type
    // code for f32, and encoding 1 presumably the dense row-major encoding — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result storage, the same size as the 32x16 input %2.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: three read-only inputs, one write-only output.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region for the dispatch entry point: each index argument
      // is divided by 16, rounding up (#map0), and the results are returned
      // together with a constant 1.
      // NOTE(review): %arg1/%arg2 are presumably the workload values supplied at
      // the dispatch site — confirm against the HAL dialect documentation.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The count derived from %arg2 is returned first, then the one from %arg1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: for every 16x16 output tile visited by this workgroup it
        // multiplies a slice of binding 0 by a slice of binding 1 into a
        // zero-filled accumulator, adds the matching tile of binding 2
        // element-wise, and stores the result into binding 3.
        // In this dump the only extra attribute left on the linalg ops is the
        // matmul's `lowering_config`.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 multiplies by 16: the workgroup id gives the loop start offset,
          // the workgroup count gives the loop step.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32 output rows, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16 output columns, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (loaded from the write-only output binding) only seeds the
              // iter_arg below; each 8x16 slice of it is filled with %cst before use.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction loop: walk the 24-wide contraction dimension in steps of 12,
                // accumulating into the zero-filled 8x16 slice.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the binding-2 slice (%16) onto the matmul result (%15).
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile back to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): checks the three incoming
  // buffer views, imports them as external stream resources, runs the single
  // matmul dispatch into a newly allocated result resource, and returns that
  // result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors — TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; NOTE(review): presumably the HAL element-type
    // code for f32, and encoding 1 presumably the dense row-major encoding — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result storage, the same size as the 32x16 input %2.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: three read-only inputs, one write-only output.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region for the dispatch entry point: each index argument
      // is divided by 16, rounding up (#map0), and the results are returned
      // together with a constant 1.
      // NOTE(review): %arg1/%arg2 are presumably the workload values supplied at
      // the dispatch site — confirm against the HAL dialect documentation.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The count derived from %arg2 is returned first, then the one from %arg1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: for every 16x16 output tile visited by this workgroup it
        // multiplies a slice of binding 0 by a slice of binding 1 into a
        // zero-filled accumulator, adds the matching tile of binding 2
        // element-wise, and stores the result into binding 3.
        // In this dump the only extra attribute left on the linalg ops is the
        // matmul's `lowering_config`.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 multiplies by 16: the workgroup id gives the loop start offset,
          // the workgroup count gives the loop step.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32 output rows, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16 output columns, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (loaded from the write-only output binding) only seeds the
              // iter_arg below; each 8x16 slice of it is filled with %cst before use.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction loop: walk the 24-wide contraction dimension in steps of 12,
                // accumulating into the zero-filled 8x16 slice.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the binding-2 slice (%16) onto the matmul result (%15).
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile back to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): checks the three incoming
  // buffer views, imports them as external stream resources, runs the single
  // matmul dispatch into a newly allocated result resource, and returns that
  // result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors — TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; NOTE(review): presumably the HAL element-type
    // code for f32, and encoding 1 presumably the dense row-major encoding — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result storage, the same size as the 32x16 input %2.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: three read-only inputs, one write-only output.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region for the dispatch entry point: each index argument
      // is divided by 16, rounding up (#map0), and the results are returned
      // together with a constant 1.
      // NOTE(review): %arg1/%arg2 are presumably the workload values supplied at
      // the dispatch site — confirm against the HAL dialect documentation.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // The count derived from %arg2 is returned first, then the one from %arg1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body: for every 16x16 output tile visited by this workgroup it
        // multiplies a slice of binding 0 by a slice of binding 1 into a
        // zero-filled accumulator, adds the matching tile of binding 2
        // element-wise, and stores the result into binding 3.
        // In this dump the only extra attribute left on the linalg ops is the
        // matmul's `lowering_config`.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 multiplies by 16: the workgroup id gives the loop start offset,
          // the workgroup count gives the loop step.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32 output rows, driven by the y workgroup id/count.
          scf.for %arg0 = %4 to %c32 step %5 {
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16 output columns, driven by the x workgroup id/count.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 (loaded from the write-only output binding) only seeds the
              // iter_arg below; each 8x16 slice of it is filled with %cst before use.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction loop: walk the 24-wide contraction dimension in steps of 12,
                // accumulating into the zero-filled 8x16 slice.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the binding-2 slice (%16) onto the matmul result (%15).
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the finished 16x16 tile back to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub): checks the three incoming
  // buffer views, imports them as external stream resources, runs the single
  // matmul dispatch into a newly allocated result resource, and returns that
  // result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors — TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // 553648160 == 0x21000020; NOTE(review): presumably the HAL element-type
    // code for f32, and encoding 1 presumably the dense row-major encoding — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized result storage, the same size as the 32x16 input %2.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: three read-only inputs, one write-only output.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable for the single matmul dispatch: one export region plus the
  // function that contains the tiled computation.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        // Returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
        // NOTE(review): presumably these are the x/y/z workgroup counts derived from
        // the dispatch workload - confirm against the HAL dialect docs.
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0..2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4/%6 are the loop start offsets (id * 16) and %5/%7 the
          // loop steps (count * 16) for the y and x dimensions respectively.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension; each iteration handles a 16-row tile.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slice of binding 0 for the current row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension; each iteration handles a 16-column tile.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the output binding, used as the initial iter_arg below.
              // %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Process the 16x16 tile as 8-row slabs (0 to 16 step 8).
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slab that serves as the matmul accumulator.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Accumulate over the 24-element reduction dimension in chunks of 12.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 onto the matmul result.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Put the finished slab back into the 16x16 tile carried by the loop.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them as
  // stream resources, runs the matmul dispatch and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors - TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type/encoding operands of the
    // asserts below; presumably the f32 element-type code and a dense encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, then imported as an external resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource of size %c2048 that the dispatch writes the result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [%c32, %c16]: the three imported resources are bound
      // read-only (ro) and the allocated one write-only (wo), each over its full range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyPadPass (linalg-strategy-pad-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable for the single matmul dispatch: one export region plus the
  // function that contains the tiled computation.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        // Returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
        // NOTE(review): presumably these are the x/y/z workgroup counts derived from
        // the dispatch workload - confirm against the HAL dialect docs.
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0..2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4/%6 are the loop start offsets (id * 16) and %5/%7 the
          // loop steps (count * 16) for the y and x dimensions respectively.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension; each iteration handles a 16-row tile.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slice of binding 0 for the current row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension; each iteration handles a 16-column tile.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the output binding, used as the initial iter_arg below.
              // %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Process the 16x16 tile as 8-row slabs (0 to 16 step 8).
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slab that serves as the matmul accumulator.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Accumulate over the 24-element reduction dimension in chunks of 12.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  // The matmul carries an `__internal_linalg_transform__ = "1"` attribute next to
                  // its lowering config; its operands and result types are static (8x12, 12x16, 8x16).
                  %21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 onto the matmul result.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Put the finished slab back into the 16x16 tile carried by the loop.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them as
  // stream resources, runs the matmul dispatch and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors - TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type/encoding operands of the
    // asserts below; presumably the f32 element-type code and a dense encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, then imported as an external resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource of size %c2048 that the dispatch writes the result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [%c32, %c16]: the three imported resources are bound
      // read-only (ro) and the allocated one write-only (wo), each over its full range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable for the single matmul dispatch: one export region plus the
  // function that contains the tiled computation.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        // Returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
        // NOTE(review): presumably these are the x/y/z workgroup counts derived from
        // the dispatch workload - confirm against the HAL dialect docs.
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0..2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4/%6 are the loop start offsets (id * 16) and %5/%7 the
          // loop steps (count * 16) for the y and x dimensions respectively.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension; each iteration handles a 16-row tile.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slice of binding 0 for the current row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension; each iteration handles a 16-column tile.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the output binding, used as the initial iter_arg below.
              // %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Process the 16x16 tile as 8-row slabs (0 to 16 step 8).
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slab that serves as the matmul accumulator.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Accumulate over the 24-element reduction dimension in chunks of 12.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  // The matmul carries an `__internal_linalg_transform__ = "1"` attribute next to
                  // its lowering config; its operands and result types are static (8x12, 12x16, 8x16).
                  %21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 onto the matmul result.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Put the finished slab back into the 16x16 tile carried by the loop.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them as
  // stream resources, runs the matmul dispatch and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors - TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type/encoding operands of the
    // asserts below; presumably the f32 element-type code and a dense encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, then imported as an external resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource of size %c2048 that the dispatch writes the result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [%c32, %c16]: the three imported resources are bound
      // read-only (ro) and the allocated one write-only (wo), each over its full range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable for the single matmul dispatch: one export region plus the
  // function that contains the tiled computation.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        // Returns (ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1); #map0 is `s0 ceildiv 16`.
        // NOTE(review): presumably these are the x/y/z workgroup counts derived from
        // the dispatch workload - confirm against the HAL dialect docs.
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0..2 are read-only tensors (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4/%6 are the loop start offsets (id * 16) and %5/%7 the
          // loop steps (count * 16) for the y and x dimensions respectively.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row loop over the 32-row dimension; each iteration handles a 16-row tile.
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slice of binding 0 for the current row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column loop over the 16-column dimension; each iteration handles a 16-column tile.
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: 16x16 tile of the output binding, used as the initial iter_arg below.
              // %10: matching 16x16 tile of binding 2. %11: 24x16 slice of binding 1.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Process the 16x16 tile as 8-row slabs (0 to 16 step 8).
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Zero-fill the 8x16 slab that serves as the matmul accumulator.
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Accumulate over the 24-element reduction dimension in chunks of 12.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  // The matmul carries an `__internal_linalg_transform__ = "1"` attribute next to
                  // its lowering config; its operands and result types are static (8x12, 12x16, 8x16).
                  %21 = linalg.matmul {__internal_linalg_transform__ = "1", lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 onto the matmul result.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Put the finished slab back into the 16x16 tile carried by the loop.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them as
  // stream resources, runs the matmul dispatch and exports the result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (presumably byte sizes of the f32 tensors - TODO confirm).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type/encoding operands of the
    // asserts below; presumably the f32 element-type code and a dense encoding - confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, then imported as an external resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource of size %c2048 that the dispatch writes the result into.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Dispatch with workload [%c32, %c16]: the three imported resources are bound
      // read-only (ro) and the allocated one write-only (wo), each over its full range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with one variant (embedded-elf-x86_64 on llvm-cpu) and one exported
  // entry point, ordinal 0.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region. #map0 is `s0 ceildiv 16`, so each index workload
      // operand is converted to a number of 16-wide tiles. The count derived from
      // %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. For every 16x16 output tile handled by this workgroup it
        // computes (binding0 tile x binding1 tile) + binding2 tile and stores the
        // result into binding 3. The matmul is still a tiled linalg.matmul here.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4 / %6 are this workgroup's first row / column
          // offset, %5 / %7 are the loop steps (workgroup count * 16).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Outer loop over row offsets of the 32-row result (uses the y id/count).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row panel of binding 0; it does not depend on the column loop.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Inner loop over column offsets of the 16-column result (x id/count).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 is read from the write-only output binding and is used only as the
              // initial value of the loop below, where every 8x16 slice of it is
              // replaced by a zero fill before accumulation.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8; each slice
              // keeps all 16 columns.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero accumulator for this 8x16 slice.
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction tiling: walk the 24-wide shared dimension in steps of 12,
                // accumulating into %arg5.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 (%10) onto
                // the matmul result; #map2 is the 2-D identity map.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Place the finished slice back into the 16x16 tile.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Entry-point function (marked iree.abi.stub). Takes three !hal.buffer_view
  // arguments, checks and imports them as 32x24, 24x16 and 32x16 f32 tensors, runs
  // the matmul dispatch, and returns the 32x16 result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // NOTE(review): 553648160 == 0x21000020 and encoding 1 are presumably the HAL
    // codes for f32 / dense row-major -- confirm against the HAL definitions.
    // Each argument is asserted against its expected shape, then imported as an
    // external stream resource whose size operand equals rows * cols * 4
    // (3072, 1536, 2048).
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: uninitialized, same size as the imported 32x16 argument.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]. %arg3..%arg5 (the imports) are bound
    // read-only and %arg6 (the fresh allocation) write-only, each over its full
    // [0, size) range, on bindings 0..3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as the 32x16 result.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgFuse (linalg-fuse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with one variant (embedded-elf-x86_64 on llvm-cpu) and one exported
  // entry point, ordinal 0.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region. #map0 is `s0 ceildiv 16`, so each index workload
      // operand is converted to a number of 16-wide tiles. The count derived from
      // %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. For every 16x16 output tile handled by this workgroup it
        // computes (binding0 tile x binding1 tile) + binding2 tile and stores the
        // result into binding 3. The matmul is still a tiled linalg.matmul here.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4 / %6 are this workgroup's first row / column
          // offset, %5 / %7 are the loop steps (workgroup count * 16).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Outer loop over row offsets of the 32-row result (uses the y id/count).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row panel of binding 0; it does not depend on the column loop.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Inner loop over column offsets of the 16-column result (x id/count).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 is read from the write-only output binding and is used only as the
              // initial value of the loop below, where every 8x16 slice of it is
              // replaced by a zero fill before accumulation.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8; each slice
              // keeps all 16 columns.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero accumulator for this 8x16 slice.
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction tiling: walk the 24-wide shared dimension in steps of 12,
                // accumulating into %arg5.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 (%10) onto
                // the matmul result; #map2 is the 2-D identity map.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Place the finished slice back into the 16x16 tile.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Entry-point function (marked iree.abi.stub). Takes three !hal.buffer_view
  // arguments, checks and imports them as 32x24, 24x16 and 32x16 f32 tensors, runs
  // the matmul dispatch, and returns the 32x16 result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // NOTE(review): 553648160 == 0x21000020 and encoding 1 are presumably the HAL
    // codes for f32 / dense row-major -- confirm against the HAL definitions.
    // Each argument is asserted against its expected shape, then imported as an
    // external stream resource whose size operand equals rows * cols * 4
    // (3072, 1536, 2048).
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: uninitialized, same size as the imported 32x16 argument.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]. %arg3..%arg5 (the imports) are bound
    // read-only and %arg6 (the fresh allocation) write-only, each over its full
    // [0, size) range, on bindings 0..3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as the 32x16 result.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- //
#config = #iree_codegen.lowering_config<tile_sizes = [[16, 16, 0], [8, 32, 0], [0, 0, 12]]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with one variant (embedded-elf-x86_64 on llvm-cpu) and one exported
  // entry point, ordinal 0.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region. #map0 is `s0 ceildiv 16`, so each index workload
      // operand is converted to a number of 16-wide tiles. The count derived from
      // %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. For every 16x16 output tile handled by this workgroup it
        // computes (binding0 tile x binding1 tile) + binding2 tile and stores the
        // result into binding 3. The matmul is still a tiled linalg.matmul here.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4 / %6 are this workgroup's first row / column
          // offset, %5 / %7 are the loop steps (workgroup count * 16).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Outer loop over row offsets of the 32-row result (uses the y id/count).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row panel of binding 0; it does not depend on the column loop.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Inner loop over column offsets of the 16-column result (x id/count).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 is read from the write-only output binding and is used only as the
              // initial value of the loop below, where every 8x16 slice of it is
              // replaced by a zero fill before accumulation.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8; each slice
              // keeps all 16 columns.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero accumulator for this 8x16 slice.
                %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<8x16xf32>) -> tensor<8x16xf32>
                // Reduction tiling: walk the 24-wide shared dimension in steps of 12,
                // accumulating into %arg5.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  %19 = tensor.extract_slice %8[%arg2, %arg4] [8, 12] [1, 1] : tensor<16x24xf32> to tensor<8x12xf32>
                  %20 = tensor.extract_slice %11[%arg4, 0] [12, 16] [1, 1] : tensor<24x16xf32> to tensor<12x16xf32>
                  %21 = linalg.matmul {lowering_config = #config} ins(%19, %20 : tensor<8x12xf32>, tensor<12x16xf32>) outs(%arg5 : tensor<8x16xf32>) -> tensor<8x16xf32>
                  scf.yield %21 : tensor<8x16xf32>
                }
                // Element-wise add of the matching 8x16 slice of binding 2 (%10) onto
                // the matmul result; #map2 is the 2-D identity map.
                %16 = tensor.extract_slice %10[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                %17 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<8x16xf32>) outs(%15 : tensor<8x16xf32>) {
                ^bb0(%arg4: f32, %arg5: f32):
                  %19 = arith.addf %arg5, %arg4 : f32
                  linalg.yield %19 : f32
                } -> tensor<8x16xf32>
                // Place the finished slice back into the 16x16 tile.
                %18 = tensor.insert_slice %17 into %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<16x16xf32>
                scf.yield %18 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Entry-point function (marked iree.abi.stub). Takes three !hal.buffer_view
  // arguments, checks and imports them as 32x24, 24x16 and 32x16 f32 tensors, runs
  // the matmul dispatch, and returns the 32x16 result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // NOTE(review): 553648160 == 0x21000020 and encoding 1 are presumably the HAL
    // codes for f32 / dense row-major -- confirm against the HAL definitions.
    // Each argument is asserted against its expected shape, then imported as an
    // external stream resource whose size operand equals rows * cols * 4
    // (3072, 1536, 2048).
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: uninitialized, same size as the imported 32x16 argument.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]. %arg3..%arg5 (the imports) are bound
    // read-only and %arg6 (the fresh allocation) write-only, each over its full
    // [0, size) range, on bindings 0..3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as the 32x16 result.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyVectorizePass (linalg-strategy-vectorize-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable with one variant (embedded-elf-x86_64 on llvm-cpu) and one exported
  // entry point, ordinal 0.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region. #map0 is `s0 ceildiv 16`, so each index workload
      // operand is converted to a number of 16-wide tiles. The count derived from
      // %arg2 is returned first, the one from %arg1 second, and the third is 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body in vectorized form. For every 16x16 output tile handled by
        // this workgroup it computes (binding0 tile x binding1 tile) + binding2 tile
        // using vector transfer/contract ops and stores the result into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // %cst is the all-zero 8x16 vector used to initialise the accumulator;
          // %cst_0 (below) is the scalar passed as the padding operand of every
          // vector.transfer_read.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only (32x24, 24x16, 32x16); binding 3 is the
          // write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: %4 / %6 are this workgroup's first row / column
          // offset, %5 / %7 are the loop steps (workgroup count * 16).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Outer loop over row offsets of the 32-row result (uses the y id/count).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row panel of binding 0; it does not depend on the column loop.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Inner loop over column offsets of the 16-column result (x id/count).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9 is read from the write-only output binding and is used only as the
              // initial value of the loop below; both 8-row halves of it are
              // overwritten by the transfer_write that ends each iteration.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Second tiling level: walk the 16 tile rows in steps of 8.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                %13 = tensor.extract_slice %arg3[%arg2, 0] [8, 16] [1, 1] : tensor<16x16xf32> to tensor<8x16xf32>
                // Zero-initialise the whole 8x16 accumulator slice with the splat %cst.
                %14 = vector.transfer_write %cst, %13[%c0, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<8x16xf32>
                // Reduction tiling: walk the 24-wide shared dimension in steps of 12.
                %15 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %14) -> (tensor<8x16xf32>) {
                  // Read the 8x12 and 12x16 operand sub-blocks straight from the
                  // workgroup tiles, plus the current 8x16 accumulator.
                  %20 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %21 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  %22 = vector.transfer_read %arg5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32>
                  // Matmul-shaped contraction: #map2/#map3/#map4 index the operands as
                  // (d0, d2), (d2, d1), (d0, d1) with d2 the reduction dimension and
                  // `add` as the combining kind, accumulating into %22.
                  %23 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %20, %21, %22 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  %24 = vector.transfer_write %23, %arg5[%c0, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<8x16xf32>
                  scf.yield %24 : tensor<8x16xf32>
                }
                // Add the matching 8x16 rows of binding 2 (%10) to the contraction
                // result and write the sum directly into rows [%arg2, %arg2 + 8) of the
                // 16x16 tile.
                %16 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %17 = vector.transfer_read %15[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<8x16xf32>, vector<8x16xf32>
                %18 = arith.addf %17, %16 : vector<8x16xf32>
                %19 = vector.transfer_write %18, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %19 : tensor<16x16xf32>
              }
              // Write the completed 16x16 tile to the output binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Entry-point function (marked iree.abi.stub). Takes three !hal.buffer_view
  // arguments, checks and imports them as 32x24, 24x16 and 32x16 f32 tensors, runs
  // the matmul dispatch, and returns the 32x16 result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // NOTE(review): 553648160 == 0x21000020 and encoding 1 are presumably the HAL
    // codes for f32 / dense row-major -- confirm against the HAL definitions.
    // Each argument is asserted against its expected shape, then imported as an
    // external stream resource whose size operand equals rows * cols * 4
    // (3072, 1536, 2048).
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Result storage: uninitialized, same size as the imported 32x16 argument.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // One dispatch with workload [32, 16]. %arg3..%arg5 (the imports) are bound
    // read-only and %arg6 (the fresh allocation) write-only, each over its full
    // [0, size) range, on bindings 0..3 of set 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint, then export %3 as the 32x16 result.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases and module as printed after the CSE pass.
// Device target: llvm-cpu, with one executable target producing an embedded ELF for x86_64.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; set 0 holds four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide by 16; applied to the workload indices in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply by 16; turns workgroup ids/counts into tile offsets/loop steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract below
// (LHS indexed by (d0, d2), RHS by (d2, d1), accumulator by (d0, d1)).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes the device and two workload indices and returns three index
      // values: ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1.
      // NOTE(review): presumably these are the x/y/z workgroup counts — confirm against the HAL dialect docs.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Tiled kernel: for every 16x16 output tile it computes
        // (binding 0 tile, 16x24) x (binding 1 tile, 24x16) + (binding 2 tile, 16x16)
        // and stores the result into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // %cst is the zero-initialized 8x16 accumulator fed into the reduction loop;
          // %cst_0 is the scalar padding operand of the vector.transfer_read ops.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale ids/counts by 16: %4/%6 are the starting row/column offsets,
          // %5/%7 the row/column loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (index derived from workgroup id/count 1).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row slab of binding 0 starting at row %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (index derived from workgroup id/count 0).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: destination tile loaded from the write-only output binding; it only seeds the
              // iter_arg of the loop below.
              // NOTE(review): the two iterations of that loop write rows 0-7 and 8-15 across all
              // 16 columns, so the loaded contents appear to be fully overwritten — confirm.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 addend tile from binding 2; %11: 24x16 column slab of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the tile in 8-row strips, carrying the partially written tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the 24-long d2 dimension in chunks of 12, accumulating into an 8x16 vector.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // 8x12 times 12x16 contraction added onto the running accumulator %arg5.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the matching 8x16 strip of the binding-2 tile to the accumulated product,
                // then write the strip into the carried tile at rows %arg2.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above
  // into a freshly allocated result resource, and returns it as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes paired with the tensor types below:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) clauses of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three imported resources are bound
      // read-only (ro) and the allocated result write-only (wo), each over its full byte range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyEnablePass (linalg-strategy-enable-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases and module as printed after the LinalgStrategyEnablePass pass.
// Device target: llvm-cpu, with one executable target producing an embedded ELF for x86_64.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; set 0 holds four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide by 16; applied to the workload indices in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply by 16; turns workgroup ids/counts into tile offsets/loop steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract below
// (LHS indexed by (d0, d2), RHS by (d2, d1), accumulator by (d0, d1)).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes the device and two workload indices and returns three index
      // values: ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1.
      // NOTE(review): presumably these are the x/y/z workgroup counts — confirm against the HAL dialect docs.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Tiled kernel: for every 16x16 output tile it computes
        // (binding 0 tile, 16x24) x (binding 1 tile, 24x16) + (binding 2 tile, 16x16)
        // and stores the result into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // %cst is the zero-initialized 8x16 accumulator fed into the reduction loop;
          // %cst_0 is the scalar padding operand of the vector.transfer_read ops.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale ids/counts by 16: %4/%6 are the starting row/column offsets,
          // %5/%7 the row/column loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (index derived from workgroup id/count 1).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row slab of binding 0 starting at row %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (index derived from workgroup id/count 0).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: destination tile loaded from the write-only output binding; it only seeds the
              // iter_arg of the loop below.
              // NOTE(review): the two iterations of that loop write rows 0-7 and 8-15 across all
              // 16 columns, so the loaded contents appear to be fully overwritten — confirm.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 addend tile from binding 2; %11: 24x16 column slab of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the tile in 8-row strips, carrying the partially written tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the 24-long d2 dimension in chunks of 12, accumulating into an 8x16 vector.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // 8x12 times 12x16 contraction added onto the running accumulator %arg5.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the matching 8x16 strip of the binding-2 tile to the accumulated product,
                // then write the strip into the carried tile at rows %arg2.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above
  // into a freshly allocated result resource, and returns it as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes paired with the tensor types below:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) clauses of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three imported resources are bound
      // read-only (ro) and the allocated result write-only (wo), each over its full byte range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgStrategyRemoveMarkersPass (linalg-strategy-remove-markers-pass) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases and module as printed after the LinalgStrategyRemoveMarkersPass pass.
// Device target: llvm-cpu, with one executable target producing an embedded ELF for x86_64.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; set 0 holds four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide by 16; applied to the workload indices in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply by 16; turns workgroup ids/counts into tile offsets/loop steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract below
// (LHS indexed by (d0, d2), RHS by (d2, d1), accumulator by (d0, d1)).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes the device and two workload indices and returns three index
      // values: ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1.
      // NOTE(review): presumably these are the x/y/z workgroup counts — confirm against the HAL dialect docs.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Tiled kernel: for every 16x16 output tile it computes
        // (binding 0 tile, 16x24) x (binding 1 tile, 24x16) + (binding 2 tile, 16x16)
        // and stores the result into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // %cst is the zero-initialized 8x16 accumulator fed into the reduction loop;
          // %cst_0 is the scalar padding operand of the vector.transfer_read ops.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale ids/counts by 16: %4/%6 are the starting row/column offsets,
          // %5/%7 the row/column loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (index derived from workgroup id/count 1).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row slab of binding 0 starting at row %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (index derived from workgroup id/count 0).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: destination tile loaded from the write-only output binding; it only seeds the
              // iter_arg of the loop below.
              // NOTE(review): the two iterations of that loop write rows 0-7 and 8-15 across all
              // 16 columns, so the loaded contents appear to be fully overwritten — confirm.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 addend tile from binding 2; %11: 24x16 column slab of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the tile in 8-row strips, carrying the partially written tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the 24-long d2 dimension in chunks of 12, accumulating into an 8x16 vector.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // 8x12 times 12x16 contraction added onto the running accumulator %arg5.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the matching 8x16 strip of the binding-2 tile to the accumulated product,
                // then write the strip into the carried tile at rows %arg2.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above
  // into a freshly allocated result resource, and returns it as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes paired with the tensor types below:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) clauses of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three imported resources are bound
      // read-only (ro) and the allocated result write-only (wo), each over its full byte range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgSingleTilingExpert (linalg-single-tiling-expert-driver) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases and module as printed after the LinalgSingleTilingExpert pass.
// Device target: llvm-cpu, with one executable target producing an embedded ELF for x86_64.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants; set 0 holds four storage-buffer bindings (0-3).
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide by 16; applied to the workload indices in the export region.
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply by 16; turns workgroup ids/counts into tile offsets/loop steps.
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2/#map3/#map4: indexing maps of the vector.contract below
// (LHS indexed by (d0, d2), RHS by (d2, d1), accumulator by (d0, d1)).
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
// Translation info attached to the export op: CPUDoubleTilingPadExpert with workload_per_wg = [16, 16].
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: takes the device and two workload indices and returns three index
      // values: ceildiv(%arg2, 16), ceildiv(%arg1, 16), 1.
      // NOTE(review): presumably these are the x/y/z workgroup counts — confirm against the HAL dialect docs.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Tiled kernel: for every 16x16 output tile it computes
        // (binding 0 tile, 16x24) x (binding 1 tile, 24x16) + (binding 2 tile, 16x16)
        // and stores the result into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // %cst is the zero-initialized 8x16 accumulator fed into the reduction loop;
          // %cst_0 is the scalar padding operand of the vector.transfer_read ops.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only inputs (32x24, 24x16, 32x16); binding 3 is the write-only 32x16 output.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Scale ids/counts by 16: %4/%6 are the starting row/column offsets,
          // %5/%7 the row/column loop steps.
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (index derived from workgroup id/count 1).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 row slab of binding 0 starting at row %arg0.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (index derived from workgroup id/count 0).
            scf.for %arg1 = %6 to %c16 step %7 {
              // %9: destination tile loaded from the write-only output binding; it only seeds the
              // iter_arg of the loop below.
              // NOTE(review): the two iterations of that loop write rows 0-7 and 8-15 across all
              // 16 columns, so the loaded contents appear to be fully overwritten — confirm.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // %10: 16x16 addend tile from binding 2; %11: 24x16 column slab of binding 1.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the tile in 8-row strips, carrying the partially written tile in %arg3.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the 24-long d2 dimension in chunks of 12, accumulating into an 8x16 vector.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // 8x12 times 12x16 contraction added onto the running accumulator %arg5.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the matching 8x16 strip of the binding-2 tile to the accumulated product,
                // then write the strip into the carried tile at rows %arg2.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile to the output binding at [%arg0, %arg1].
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks and imports three buffer views, runs the dispatch above
  // into a freshly allocated result resource, and returns it as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes paired with the tensor types below:
    // 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4 (4 bytes per f32 element).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) clauses of the asserts below.
    // NOTE(review): presumably the HAL codes for f32 and a dense row-major layout — confirm.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape/type/encoding, then imported
    // as an external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized 2048-byte external resource that receives the dispatch result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      // Single dispatch with workload [32, 16]: the three imported resources are bound
      // read-only (ro) and the allocated result write-only (wo), each over its full byte range.
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource as a 32x16xf32 buffer view.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The region maps the two
      // index arguments to the three index values handed to hal.return.
      // NOTE(review): presumably these are the workgroup counts along x, y, z and
      // %arg1/%arg2 are the workload values passed at the dispatch site -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is `s0 ceildiv 16`: each argument is split into chunks of 16.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order is swapped relative to the arguments: the %arg2-derived value
        // comes first, then the %arg1-derived value, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side kernel for the 32x16x24 matmul dispatch, in tiled tensor +
        // vector form. For every 16x16 output tile it computes
        //   binding(3) tile = binding(0) tile (16x24) x binding(1) tile (24x16)
        //                     + binding(2) tile (16x16)
        // using 8x12 and 12x16 vector reads and an 8x16 vector accumulator.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // Zero vector: initial accumulator of the reduction loop below.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          // Scalar zero passed as the padding operand of the vector.transfer_read ops.
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only operands; binding 3 is the write-only result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: loop lower bound = workgroup id * 16 and
          // loop step = workgroup count * 16, for y (rows) and x (columns).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (y dimension).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slab of binding(0) for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (x dimension).
            scf.for %arg1 = %6 to %c16 step %7 {
              // Tile of the result binding; only used as the destination tensor
              // threaded through the strip loop's iter_args.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching tile of binding(2), added to the product below.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // 24x16 slab of binding(1) for this column tile.
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in strips of 8 rows.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the shared dimension (24) in chunks of 12,
                // accumulating into an 8x16 vector that starts at zero.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // Matmul-shaped contraction: (d0,d2) x (d2,d1) -> (d0,d1), d2 reduced with add.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the binding(2) strip to the product, then write the 8x16
                // strip into the destination tile.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile into the result binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them
  // as external stream resources, runs the matmul dispatch into a freshly
  // allocated result resource, and returns that result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (consistent with 4-byte f32 elements).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type(...) and encoding(...)
    // operands of the asserts; presumably the HAL codes for f32 / dense
    // row-major -- confirm against the HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an
    // external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Run the dispatch with workload [32, 16]. %0-%2 are bound read-only (ro)
    // and %3 write-only (wo), each over its full range starting at offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as a
    // 32x16xf32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After CSE (cse) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The region maps the two
      // index arguments to the three index values handed to hal.return.
      // NOTE(review): presumably these are the workgroup counts along x, y, z and
      // %arg1/%arg2 are the workload values passed at the dispatch site -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is `s0 ceildiv 16`: each argument is split into chunks of 16.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order is swapped relative to the arguments: the %arg2-derived value
        // comes first, then the %arg1-derived value, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side kernel for the 32x16x24 matmul dispatch, in tiled tensor +
        // vector form. For every 16x16 output tile it computes
        //   binding(3) tile = binding(0) tile (16x24) x binding(1) tile (24x16)
        //                     + binding(2) tile (16x16)
        // using 8x12 and 12x16 vector reads and an 8x16 vector accumulator.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // Zero vector: initial accumulator of the reduction loop below.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          // Scalar zero passed as the padding operand of the vector.transfer_read ops.
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only operands; binding 3 is the write-only result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: loop lower bound = workgroup id * 16 and
          // loop step = workgroup count * 16, for y (rows) and x (columns).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (y dimension).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slab of binding(0) for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (x dimension).
            scf.for %arg1 = %6 to %c16 step %7 {
              // Tile of the result binding; only used as the destination tensor
              // threaded through the strip loop's iter_args.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching tile of binding(2), added to the product below.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // 24x16 slab of binding(1) for this column tile.
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in strips of 8 rows.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the shared dimension (24) in chunks of 12,
                // accumulating into an 8x16 vector that starts at zero.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // Matmul-shaped contraction: (d0,d2) x (d2,d1) -> (d0,d1), d2 reduced with add.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the binding(2) strip to the product, then write the 8x16
                // strip into the destination tile.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile into the result binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them
  // as external stream resources, runs the matmul dispatch into a freshly
  // allocated result resource, and returns that result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (consistent with 4-byte f32 elements).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type(...) and encoding(...)
    // operands of the asserts; presumably the HAL codes for f32 / dense
    // row-major -- confirm against the HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an
    // external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Run the dispatch with workload [32, 16]. %0-%2 are bound read-only (ro)
    // and %3 write-only (wo), each over its full range starting at offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as a
    // 32x16xf32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After LinalgInitTensorToAllocTensor (linalg-init-tensor-to-alloc-tensor) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The region maps the two
      // index arguments to the three index values handed to hal.return.
      // NOTE(review): presumably these are the workgroup counts along x, y, z and
      // %arg1/%arg2 are the workload values passed at the dispatch site -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is `s0 ceildiv 16`: each argument is split into chunks of 16.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order is swapped relative to the arguments: the %arg2-derived value
        // comes first, then the %arg1-derived value, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Device-side kernel for the 32x16x24 matmul dispatch, in tiled tensor +
        // vector form. For every 16x16 output tile it computes
        //   binding(3) tile = binding(0) tile (16x24) x binding(1) tile (24x16)
        //                     + binding(2) tile (16x16)
        // using 8x12 and 12x16 vector reads and an 8x16 vector accumulator.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // Zero vector: initial accumulator of the reduction loop below.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          // Scalar zero passed as the padding operand of the vector.transfer_read ops.
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Bindings 0-2 are read-only operands; binding 3 is the write-only result.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: loop lower bound = workgroup id * 16 and
          // loop step = workgroup count * 16, for y (rows) and x (columns).
          %4 = affine.apply #map1()[%workgroup_id_y]
          %5 = affine.apply #map1()[%workgroup_count_y]
          %6 = affine.apply #map1()[%workgroup_id_x]
          %7 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (y dimension).
          scf.for %arg0 = %4 to %c32 step %5 {
            // 16x24 slab of binding(0) for this row tile.
            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x24xf32> -> tensor<16x24xf32>
            // Column-tile loop over the 16 output columns (x dimension).
            scf.for %arg1 = %6 to %c16 step %7 {
              // Tile of the result binding; only used as the destination tensor
              // threaded through the strip loop's iter_args.
              %9 = flow.dispatch.tensor.load %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<writeonly:32x16xf32> -> tensor<16x16xf32>
              // Matching tile of binding(2), added to the product below.
              %10 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xf32> -> tensor<16x16xf32>
              // 24x16 slab of binding(1) for this column tile.
              %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [24, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x16xf32> -> tensor<24x16xf32>
              // Walk the 16-row tile in strips of 8 rows.
              %12 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %9) -> (tensor<16x16xf32>) {
                // Reduction over the shared dimension (24) in chunks of 12,
                // accumulating into an 8x16 vector that starts at zero.
                %13 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %17 = vector.transfer_read %8[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : tensor<16x24xf32>, vector<8x12xf32>
                  %18 = vector.transfer_read %11[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : tensor<24x16xf32>, vector<12x16xf32>
                  // Matmul-shaped contraction: (d0,d2) x (d2,d1) -> (d0,d1), d2 reduced with add.
                  %19 = vector.contract {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %19 : vector<8x16xf32>
                }
                // Add the binding(2) strip to the product, then write the 8x16
                // strip into the destination tile.
                %14 = vector.transfer_read %10[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : tensor<16x16xf32>, vector<8x16xf32>
                %15 = arith.addf %13, %14 : vector<8x16xf32>
                %16 = vector.transfer_write %15, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, tensor<16x16xf32>
                scf.yield %16 : tensor<16x16xf32>
              }
              // Store the finished 16x16 tile into the result binding.
              flow.dispatch.tensor.store %12, %3, offsets = [%arg0, %arg1], sizes = [16, 16], strides = [1, 1] : tensor<16x16xf32> -> !flow.dispatch.tensor<writeonly:32x16xf32>
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them
  // as external stream resources, runs the matmul dispatch into a freshly
  // allocated result resource, and returns that result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (consistent with 4-byte f32 elements).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type(...) and encoding(...)
    // operands of the asserts; presumably the HAL codes for f32 / dense
    // row-major -- confirm against the HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an
    // external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Run the dispatch with workload [32, 16]. %0-%2 are bound read-only (ro)
    // and %3 write-only (wo), each over its full range starting at offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as a
    // 32x16xf32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) ('builtin.module' operation) //----- //
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
#map2 = affine_map<(d0, d1)[s0] -> (d0 * 24 + s0 + d1)>
#map3 = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map7 = affine_map<(d0, d1) -> (d0, d1)>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export entry (ordinal 0) for the matmul dispatch. The region maps the two
      // index arguments to the three index values handed to hal.return.
      // NOTE(review): presumably these are the workgroup counts along x, y, z and
      // %arg1/%arg2 are the workload values passed at the dispatch site -- confirm.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        // #map0 is `s0 ceildiv 16`: each argument is split into chunks of 16.
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        // Order is swapped relative to the arguments: the %arg2-derived value
        // comes first, then the %arg1-derived value, then the constant 1.
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Bufferized form of the 32x16x24 matmul dispatch: the kernel reads and
        // writes memref subviews of the bindings instead of tensor tiles.
        // Per 16x16 output tile:
        //   binding(3) tile = binding(0) tile (16x24) x binding(1) (24x16)
        //                     + binding(2) tile (16x16)
        // The bufferization.to_tensor results (%13, %15, %17, %18, %20, %22, %26)
        // and the !flow.dispatch.tensor subspans (%1, %3, %5, %7) have no uses
        // in this function.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // Zero vector: initial accumulator of the reduction loop below.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          // Scalar zero passed as the padding operand of the vector.transfer_read ops.
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Each binding appears twice: as a memref (followed by a 64 alignment
          // assumption) and as the original !flow.dispatch.tensor subspan.
          // binding(0): 32x24 operand.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<32x24xf32>
          memref.assume_alignment %0, 64 : memref<32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          // binding(1): 24x16 operand.
          %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<24x16xf32>
          memref.assume_alignment %2, 64 : memref<24x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          // binding(2): 32x16 operand that is added to the product.
          %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
          memref.assume_alignment %4, 64 : memref<32x16xf32>
          %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          // binding(3): 32x16 result buffer.
          %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
          memref.assume_alignment %6, 64 : memref<32x16xf32>
          %7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // #map1 is `s0 * 16`: loop lower bound = workgroup id * 16 and
          // loop step = workgroup count * 16, for y (rows) and x (columns).
          %8 = affine.apply #map1()[%workgroup_id_y]
          %9 = affine.apply #map1()[%workgroup_count_y]
          %10 = affine.apply #map1()[%workgroup_id_x]
          %11 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over the 32 output rows (y dimension).
          scf.for %arg0 = %8 to %c32 step %9 {
            // 16x24 view into binding(0) for this row tile.
            %12 = memref.subview %0[%arg0, 0] [16, 24] [1, 1] : memref<32x24xf32> to memref<16x24xf32, #map2>
            %13 = bufferization.to_tensor %12 : memref<16x24xf32, #map2>
            // Column-tile loop over the 16 output columns (x dimension).
            scf.for %arg1 = %10 to %c16 step %11 {
              // 16x16 view into the result buffer; this is the destination written in place.
              %14 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              %15 = bufferization.to_tensor %14 : memref<16x16xf32, #map3>
              // 16x16 view into binding(2) at the same offsets.
              %16 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              %17 = bufferization.to_tensor %16 : memref<16x16xf32, #map3>
              // binding(1) is used whole (no subview is taken for it).
              %18 = bufferization.to_tensor %2 : memref<24x16xf32>
              // Walk the 16-row tile in strips of 8 rows. The iter_arg is the
              // destination subview itself and is yielded unchanged, so %19
              // refers to the same buffer as %14.
              %19 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %14) -> (memref<16x16xf32, #map3>) {
                %22 = bufferization.to_tensor %arg3 : memref<16x16xf32, #map3>
                // Reduction over the shared dimension (24) in chunks of 12,
                // accumulating into an 8x16 vector that starts at zero.
                %23 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  %27 = vector.transfer_read %12[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : memref<16x24xf32, #map2>, vector<8x12xf32>
                  %28 = vector.transfer_read %2[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : memref<24x16xf32>, vector<12x16xf32>
                  // Matmul-shaped contraction: (d0,d2) x (d2,d1) -> (d0,d1), d2 reduced with add.
                  %29 = vector.contract {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %28, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %29 : vector<8x16xf32>
                }
                // Add the binding(2) strip to the product and write the 8x16
                // strip directly into the result subview.
                %24 = vector.transfer_read %16[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, #map3>, vector<8x16xf32>
                %25 = arith.addf %23, %24 : vector<8x16xf32>
                vector.transfer_write %25, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, memref<16x16xf32, #map3>
                %26 = bufferization.to_tensor %arg3 : memref<16x16xf32, #map3>
                scf.yield %arg3 : memref<16x16xf32, #map3>
              }
              %20 = bufferization.to_tensor %19 : memref<16x16xf32, #map3>
              // Element-wise copy (the body just yields the input element) from
              // %19 into %21. Both are subviews of %6 at the same offsets and
              // sizes, so the tile is copied onto the same region of binding(3).
              %21 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel"]} ins(%19 : memref<16x16xf32, #map3>) outs(%21 : memref<16x16xf32, #map3>) {
              ^bb0(%arg2: f32, %arg3: f32):
                linalg.yield %arg2 : f32
              }
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point: checks the three incoming buffer views, imports them
  // as external stream resources, runs the matmul dispatch into a freshly
  // allocated result resource, and returns that result as a buffer view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4
    // (consistent with 4-byte f32 elements).
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // NOTE(review): 553648160 and 1 are passed as the type(...) and encoding(...)
    // operands of the asserts; presumably the HAL codes for f32 / dense
    // row-major -- confirm against the HAL headers.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // For each argument: assert shape/type/encoding, then import it as an
    // external stream resource of the matching size.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized resource that receives the 32x16 result.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Run the dispatch with workload [32, 16]. %0-%2 are bound read-only (ro)
    // and %3 write-only (wo), each over its full range starting at offset 0.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before using %3, then export it as a
    // 32x16xf32 buffer view and return it.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) ('builtin.module' operation) //----- //
// Attribute aliases shared by the module printed below.
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one descriptor set (0) with four
// storage_buffer bindings numbered 0..3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide a symbol by 16 (applied in the export region below).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply a symbol by 16 (applied to workgroup ids/counts below).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2 / #map3: 2-D layouts with row stride 24 / 16, unit column stride and
// a symbolic offset s0; they appear as the layouts of the memref.subview
// results below.
#map2 = affine_map<(d0, d1)[s0] -> (d0 * 24 + s0 + d1)>
#map3 = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
// #map4..#map6: indexing maps of the vector.contract below; over (d0, d1, d2)
// they select (d0, d2), (d2, d1) and (d0, d1) respectively.
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
// #map7: 2-D identity map, used by the linalg.generic below.
#map7 = affine_map<(d0, d1) -> (d0, d1)>
// Codegen translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding the single dispatch referenced by @simple_matmul_tensor,
  // with one variant for the embedded-elf-x86_64 target.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: turns the two index workload operands into a workgroup
      // count. Each operand is ceil-divided by 16 (#map0); the value derived
      // from %arg2 is returned first, the one derived from %arg1 second, and
      // the third returned count is the constant 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. Reads binding 0 (32x24), binding 1 (24x16) and
        // binding 2 (32x16) and writes binding 3 (32x16): for each tile it
        // accumulates a contraction of bindings 0 and 1, adds the matching
        // tile of binding 2 and stores the sum into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // Zero vector used as the initial accumulator of the reduction loop.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          // Padding value operand of the vector.transfer_read ops.
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Every binding is materialized twice: once as a memref (used by the
          // loops below) and once as a !flow.dispatch.tensor. The tensor-typed
          // values %1, %3, %5 and %7 are not referenced again in this function.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<32x24xf32>
          memref.assume_alignment %0, 64 : memref<32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<24x16xf32>
          memref.assume_alignment %2, 64 : memref<24x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
          memref.assume_alignment %4, 64 : memref<32x16xf32>
          %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
          memref.assume_alignment %6, 64 : memref<32x16xf32>
          %7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Loop bounds for workgroup distribution (#map1 multiplies by 16):
          // each loop starts at id * 16 and advances by count * 16.
          %8 = affine.apply #map1()[%workgroup_id_y]
          %9 = affine.apply #map1()[%workgroup_count_y]
          %10 = affine.apply #map1()[%workgroup_id_x]
          %11 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over [0, 32), driven by the y workgroup id/count.
          scf.for %arg0 = %8 to %c32 step %9 {
            // 16x24 window of binding 0 starting at row %arg0.
            %12 = memref.subview %0[%arg0, 0] [16, 24] [1, 1] : memref<32x24xf32> to memref<16x24xf32, #map2>
            // Column-tile loop over [0, 16), driven by the x workgroup id/count.
            scf.for %arg1 = %10 to %c16 step %11 {
              // 16x16 windows at (%arg0, %arg1): %13 into binding 3 (written),
              // %14 into binding 2 (read and added to the contraction result).
              %13 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              %14 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              // Walks the tile rows in steps of 8. The destination memref is
              // threaded through iter_args and yielded unchanged, so the loop
              // result %15 is the same view as %13.
              %15 = scf.for %arg2 = %c0 to %c16 step %c8 iter_args(%arg3 = %13) -> (memref<16x16xf32, #map3>) {
                // Reduction loop over [0, 24) in steps of 12, accumulating an
                // 8x16 vector that starts from the zero vector %cst.
                %17 = scf.for %arg4 = %c0 to %c24 step %c12 iter_args(%arg5 = %cst) -> (vector<8x16xf32>) {
                  // 8x12 slice of the binding-0 window at (%arg2, %arg4).
                  %20 = vector.transfer_read %12[%arg2, %arg4], %cst_0 {in_bounds = [true, true]} : memref<16x24xf32, #map2>, vector<8x12xf32>
                  // 12x16 slice of binding 1 at (%arg4, 0); the column index
                  // is the constant %c0, not %arg1.
                  %21 = vector.transfer_read %2[%arg4, %c0], %cst_0 {in_bounds = [true, true]} : memref<24x16xf32>, vector<12x16xf32>
                  // Add-reduction contraction over d2 into the running
                  // accumulator %arg5.
                  %22 = vector.contract {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %20, %21, %arg5 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %22 : vector<8x16xf32>
                }
                // Add the 8x16 slice of the binding-2 window to the
                // contraction result and store it into the destination view.
                %18 = vector.transfer_read %14[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, #map3>, vector<8x16xf32>
                %19 = arith.addf %17, %18 : vector<8x16xf32>
                vector.transfer_write %19, %arg3[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, memref<16x16xf32, #map3>
                scf.yield %arg3 : memref<16x16xf32, #map3>
              }
              // %16 has the same source, offsets, sizes and strides as %13.
              %16 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              // Element-wise copy (the body yields its input element) from %15
              // into %16.
              // NOTE(review): both operands view the same region of binding 3,
              // so this looks like a self-copy - confirm.
              linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel"]} ins(%15 : memref<16x16xf32, #map3>) outs(%16 : memref<16x16xf32, #map3>) {
              ^bb0(%arg2: f32, %arg3: f32):
                linalg.yield %arg2 : f32
              }
            }
          }
          return
        }
      }
    }
  }
  // Host-side entry point (marked iree.abi.stub). Checks the three incoming
  // buffer views, imports them as stream resources, runs the matmul dispatch
  // into a freshly allocated resource and returns that resource as a buffer
  // view.
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    // Resource sizes: 3072 = 32*24*4, 1536 = 24*16*4, 2048 = 32*16*4.
    // NOTE(review): consistent with byte sizes of the f32 tensors - confirm.
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    // Operands of the type(...) and encoding(...) clauses of the asserts below.
    // NOTE(review): presumably the HAL element-type code for f32 and the
    // encoding code for a dense row-major layout - confirm against the HAL
    // definitions.
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    // Each argument is asserted against its expected shape, type code and
    // encoding code, then imported as an external stream resource.
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<32x16xf32> in !stream.resource<external>{%c2048}
    // Uninitialized storage for the result; bound write-only (%arg6) below.
    %3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c2048}
    // Single dispatch with workload [%c32, %c16]: the three imported resources
    // are bound read-only, the new allocation write-only, each over its full
    // range, mapped to set 0 bindings 0..3 by hal.interface.bindings.
    %4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c3072}, %1 as %arg4: !stream.resource<external>{%c1536}, %2 as %arg5: !stream.resource<external>{%c2048}, %3 as %arg6: !stream.resource<external>{%c2048}) {
      stream.cmd.dispatch @simple_matmul_tensor_dispatch_0::@simple_matmul_tensor_dispatch_0_matmul_32x16x24[%c32, %c16] {
        ro %arg3[%c0 for %c3072] : !stream.resource<external>{%c3072},
        ro %arg4[%c0 for %c1536] : !stream.resource<external>{%c1536},
        ro %arg5[%c0 for %c2048] : !stream.resource<external>{%c2048},
        wo %arg6[%c0 for %c2048] : !stream.resource<external>{%c2048}
      } attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
    } => !stream.timepoint
    // Wait on the execution timepoint before exporting the result resource.
    %5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c2048}
    %6 = stream.tensor.export %5 : tensor<32x16xf32> in !stream.resource<external>{%c2048} -> !hal.buffer_view
    return %6 : !hal.buffer_view
  }
}


// -----// IR Dump After Canonicalizer (canonicalize) ('func.func' operation: @simple_matmul_tensor_dispatch_0_matmul_32x16x24) //----- //
// Attribute aliases shared by the module printed below.
// Device target: llvm-cpu with a single embedded-elf-x86_64 executable target.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>], legacy_sync}>
// Executable layout: no push constants, one descriptor set (0) with four
// storage_buffer bindings numbered 0..3.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
// Same executable target as the one listed inside #device_target_llvm_cpu.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// #map0: ceil-divide a symbol by 16 (applied in the export region below).
#map0 = affine_map<()[s0] -> (s0 ceildiv 16)>
// #map1: multiply a symbol by 16 (applied to workgroup ids/counts below).
#map1 = affine_map<()[s0] -> (s0 * 16)>
// #map2 / #map3: 2-D layouts with row stride 24 / 16, unit column stride and
// a symbolic offset s0; they appear as the layouts of the memref.subview
// results below.
#map2 = affine_map<(d0, d1)[s0] -> (d0 * 24 + s0 + d1)>
#map3 = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
// #map4..#map6: indexing maps of the vector.contract below; over (d0, d1, d2)
// they select (d0, d2), (d2, d1) and (d0, d1) respectively.
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
// #map7: 2-D identity map, used by the linalg.generic below.
#map7 = affine_map<(d0, d1) -> (d0, d1)>
// Codegen translation info attached to the executable export.
#translation = #iree_codegen.translation_info<CPUDoubleTilingPadExpert workload_per_wg = [16, 16]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  // Executable holding the single dispatch referenced by @simple_matmul_tensor,
  // with one variant for the embedded-elf-x86_64 target.
  hal.executable private @simple_matmul_tensor_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Export region: turns the two index workload operands into a workgroup
      // count. Each operand is ceil-divided by 16 (#map0); the value derived
      // from %arg2 is returned first, the one derived from %arg1 second, and
      // the third returned count is the constant 1.
      hal.executable.export public @simple_matmul_tensor_dispatch_0_matmul_32x16x24 ordinal(0) layout(#executable_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map0()[%arg1]
        %1 = affine.apply #map0()[%arg2]
        hal.return %1, %0, %c1 : index, index, index
      }
      builtin.module {
        // Dispatch body. Reads binding 0 (32x24), binding 1 (24x16) and
        // binding 2 (32x16) and writes binding 3 (32x16): for each tile it
        // accumulates a contraction of bindings 0 and 1, adds the matching
        // tile of binding 2 and stores the sum into binding 3.
        func.func @simple_matmul_tensor_dispatch_0_matmul_32x16x24() {
          // Zero vector used as the initial accumulator of the reduction loop.
          %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
          %c24 = arith.constant 24 : index
          %c12 = arith.constant 12 : index
          %c8 = arith.constant 8 : index
          %c0 = arith.constant 0 : index
          %c32 = arith.constant 32 : index
          %c16 = arith.constant 16 : index
          // Padding value operand of the vector.transfer_read ops.
          %cst_0 = arith.constant 0.000000e+00 : f32
          // Every binding is materialized twice: once as a memref (used by the
          // loops below) and once as a !flow.dispatch.tensor. The tensor-typed
          // values %1, %3, %5 and %7 are not referenced again in this function.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<32x24xf32>
          memref.assume_alignment %0, 64 : memref<32x24xf32>
          %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x24xf32>
          %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<24x16xf32>
          memref.assume_alignment %2, 64 : memref<24x16xf32>
          %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:24x16xf32>
          %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
          memref.assume_alignment %4, 64 : memref<32x16xf32>
          %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:32x16xf32>
          %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : memref<32x16xf32>
          memref.assume_alignment %6, 64 : memref<32x16xf32>
          %7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:32x16xf32>
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          // Loop bounds for workgroup distribution (#map1 multiplies by 16):
          // each loop starts at id * 16 and advances by count * 16.
          %8 = affine.apply #map1()[%workgroup_id_y]
          %9 = affine.apply #map1()[%workgroup_count_y]
          %10 = affine.apply #map1()[%workgroup_id_x]
          %11 = affine.apply #map1()[%workgroup_count_x]
          // Row-tile loop over [0, 32), driven by the y workgroup id/count.
          scf.for %arg0 = %8 to %c32 step %9 {
            // 16x24 window of binding 0 starting at row %arg0.
            %12 = memref.subview %0[%arg0, 0] [16, 24] [1, 1] : memref<32x24xf32> to memref<16x24xf32, #map2>
            // Column-tile loop over [0, 16), driven by the x workgroup id/count.
            scf.for %arg1 = %10 to %c16 step %11 {
              // 16x16 windows at (%arg0, %arg1): %13 into binding 3 (written),
              // %14 into binding 2 (read and added to the contraction result).
              %13 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              %14 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              // Walks the tile rows in steps of 8; the loop carries no
              // iter_args and writes straight into %13.
              scf.for %arg2 = %c0 to %c16 step %c8 {
                // Reduction loop over [0, 24) in steps of 12, accumulating an
                // 8x16 vector that starts from the zero vector %cst.
                %16 = scf.for %arg3 = %c0 to %c24 step %c12 iter_args(%arg4 = %cst) -> (vector<8x16xf32>) {
                  // 8x12 slice of the binding-0 window at (%arg2, %arg3).
                  %19 = vector.transfer_read %12[%arg2, %arg3], %cst_0 {in_bounds = [true, true]} : memref<16x24xf32, #map2>, vector<8x12xf32>
                  // 12x16 slice of binding 1 at (%arg3, 0); the column index
                  // is the constant %c0, not %arg1.
                  %20 = vector.transfer_read %2[%arg3, %c0], %cst_0 {in_bounds = [true, true]} : memref<24x16xf32>, vector<12x16xf32>
                  // Add-reduction contraction over d2 into the running
                  // accumulator %arg4.
                  %21 = vector.contract {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %19, %20, %arg4 : vector<8x12xf32>, vector<12x16xf32> into vector<8x16xf32>
                  scf.yield %21 : vector<8x16xf32>
                }
                // Add the 8x16 slice of the binding-2 window to the
                // contraction result and store it into the output window.
                %17 = vector.transfer_read %14[%arg2, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, #map3>, vector<8x16xf32>
                %18 = arith.addf %16, %17 : vector<8x16xf32>
                vector.transfer_write %18, %13[%arg2, %c0] {in_bounds = [true, true]} : vector<8x16xf32>, memref<16x16xf32, #map3>
              }
              // %15 has the same source, offsets, sizes and strides as %13.
              %15 = memref.subview %6[%arg0, %arg1] [16, 16] [1, 1] : memref<32x16xf32> to memref<16x16xf32, #map3>
              // Element-wise copy (the body yields its input element) from %13
              // into %15.
              // NOTE(review): both operands view the same region of binding 3,
              // so this looks like a self-copy - confirm.
              linalg.generic {indexing_maps = [#map7, #map7], iterator_types = ["parallel", "parallel"]} ins(%13 : memref<16x16xf32, #map3>) outs(%15 : memref<16x16xf32, #map3>) {
              ^bb0(%arg2: f32, %arg3: f32):
                linalg.yield %arg2 : f32
              }
            }
          }
          return
        }
      }
    }
  }
  func.func @simple_matmul_tensor(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c0 = arith.constant 0 : index
    %c3072 = arith.constant 3072 : index
    %c1536 = arith.constant 1536 : index
    %c2048 = arith.constant 2048 : index
    %c32 = arith.constant 32 : index
    %c16 = arith.constant 16 : index
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1_i32 = arith.constant 1 : i32
    %c24 = arith.constant 24 : index
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c32, %c24]) type(%c553648160_i32) encoding(%c1_i32)
    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<32x24xf32> in !stream.resource<external>{%c3072}
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c24, %c16]) type(%c553648160_i32) encoding(%c1_i32)
    %1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<24x16xf32> in !stream.resource<external>{%c1536}
    hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c32, %c16]) type(%c553648160_i32) encoding(%c