
Max191 / afterCollapseDims.mlir
Last active July 17, 2023 19:31
IR for dequant + matmul fusion compile test
// When running with the CollapseDims pass:
// Dims on the matmul are collapsed, but not on the dequantization, so the ops do not get fused during tiling
// -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- //
func.func @something(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant dense<0.000000e+00> : tensor<4096x32xf32>
  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<4096x32x128xi8>
  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<1x32x128xf32>
  %2 = util.optimization_barrier %cst_0 : tensor<4096x32xf32>
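A minimal sketch (not part of the gist; the shapes and value names are hypothetical) of the mismatch described in the comments above: the matmul operand is collapsed to 2-D while the dequantization still produces the 3-D tensor, so the two ops no longer share an iteration space for tile-and-fuse.
// Collapse of the two inner dims (32 * 128 = 4096) of the dequantized operand
// feeding the matmul; the dequantization generic itself keeps producing
// tensor<4096x32x128xf32>, which is what blocks fusion during tiling.
%dequant_collapsed = tensor.collapse_shape %dequant [[0], [1, 2]] : tensor<4096x32x128xf32> into tensor<4096x4096xf32>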
Max191 / quantized_matmul.mlir
Created September 12, 2023 15:29
IR for reassociation of quantized matmul
builtin.module {
  func.func @quantized_matmul(%arg0: tensor<11008x32x128xi8>, %arg1: tensor<11008x32x1xf32>, %arg2: tensor<11008x32x1xf32>, %arg3: tensor<1x1x32x128xf32>) -> tensor<1x1x11008xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %4 = tensor.empty() : tensor<1x1x11008xf32>
    %5 = tensor.empty() : tensor<11008x32x128xf32>
    %6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32>
    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0, %arg1, %arg2 : tensor<11008x32x128xi8>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<11008x32x128xf32>) {
    ^bb0(%in: i8, %in_0: f32, %in_1: f32, %out: f32):
      %9 = arith.extui %in : i8 to i32
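      // Sketch (not from the gist): the dequantization body typically continues
      // by converting to float, subtracting the zero point (%in_1 assumed) and
      // multiplying by the scale (%in_0 assumed); the exact ops may differ.
      %10 = arith.uitofp %9 : i32 to f32
      %11 = arith.subf %10, %in_1 : f32
      %12 = arith.mulf %11, %in_0 : f32
      linalg.yield %12 : f32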
Max191 / fusion_results.mlir
Created September 14, 2023 19:54
FuseDequantizationMatmul
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
  %4 = tensor.empty() : tensor<11008x32x128xf32>
  %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
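  // Sketch (continuation not shown in the preview): the other unit-extent
  // operands are typically collapsed the same way; the value names below are
  // hypothetical.
  %collapsed_0 = tensor.collapse_shape %2 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
  %collapsed_1 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x1x32x128xf32> into tensor<32x128xf32>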
Max191 / file1_fuse_dequant_matmul.mlir
Last active September 18, 2023 16:57
Tiling for reassociated quantized matmul
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
  %4 = tensor.empty() : tensor<11008x32x128xf32>
  %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
Max191 / gist:750a9d7e325e4fe3a712a193382ba86c
Last active September 26, 2023 20:07
Incorrect tiling of reassociated quantized matmul
// -----// IR Dump After TileAndDecomposeWinogradTransform (iree-linalg-ext-tile-and-decompose-winograd) //----- //
func.func @quantized_matmul_dispatch_3_generic_11008x32x128_i16xi8xi32() {
  %c11008 = arith.constant 11008 : index
  %c256 = arith.constant 256 : index
  %c0 = arith.constant 0 : index
  %c128 = arith.constant 128 : index
  %c0_i32 = arith.constant 0 : i32
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c256) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x128xi16>>
Max191 / llama2_cpu_perf_mlir.patch
Created October 7, 2023 13:22
MLIR patch for llama2 CPU performance
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 7f8322bd5f6f..762de3b99494 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -460,6 +460,10 @@ LogicalResult LoadOpOfExpandShapeOpFolder<OpTy>::matchAndRewrite(
rewriter.replaceOpWithNewOp<decltype(op)>(
loadOp, expandShapeOp.getViewSource(), sourceIndices);
})
+ .Case([&](vector::LoadOp op) {
+ rewriter.replaceOpWithNewOp<vector::LoadOp>(
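In IR terms, the case added by the patch lets a vector.load fold through a memref.expand_shape the same way the existing load cases do. A rough sketch of the rewrite (buffer shapes, value names, and the vector width are illustrative, not taken from the patch):
// Before: a vector.load through an expand_shape of a flat buffer.
%expanded = memref.expand_shape %src [[0, 1]] : memref<4096xf32> into memref<32x128xf32>
%v = vector.load %expanded[%i, %j] : memref<32x128xf32>, vector<4xf32>
// After: load directly from the un-expanded source with a linearized index.
%idx = affine.apply affine_map<(d0, d1) -> (d0 * 128 + d1)>(%i, %j)
%v_folded = vector.load %src[%idx] : memref<4096xf32>, vector<4xf32>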
Max191 / reassociated_grouped_quantized_matmul.mlir
Created October 11, 2023 20:31
reassociated_grouped_quantized_matmul
func.func @reassociated_grouped_quantized_matmul(%vec: tensor<32x128xi16>, %mat: tensor<11008x32x128xi4>, %vec_scales: tensor<32xf32>, %vec_scaled_sums: tensor<32xf32>, %mat_scales: tensor<11008x32xf32>, %mat_zps: tensor<11008x32xf32>) -> tensor<11008xf32> {
  %c0_i32 = arith.constant 0 : i32
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<11008x32xi32>
  %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32xi32>) -> tensor<11008x32xi32>
  %2 = tensor.empty() : tensor<11008xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<11008xf32>) -> tensor<11008xf32>
  %batch_matmul_result = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
      affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
      affine_map<(d0, d1, d2) -> (d0, d1)>],
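      // Sketch (the preview cuts the op off here): iterator types plus an
      // extend-multiply-accumulate region; the exact extension ops used in
      // the gist may differ.
      iterator_types = ["parallel", "parallel", "reduction"]}
      ins(%vec, %mat : tensor<32x128xi16>, tensor<11008x32x128xi4>)
      outs(%1 : tensor<11008x32xi32>) {
  ^bb0(%v: i16, %m: i4, %acc: i32):
    %ext_v = arith.extsi %v : i16 to i32
    %ext_m = arith.extui %m : i4 to i32
    %prod = arith.muli %ext_v, %ext_m : i32
    %sum = arith.addi %acc, %prod : i32
    linalg.yield %sum : i32
  } -> tensor<11008x32xi32>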
Max191 / gist:f9705764cc3cd650f3c547071dcc03a9
Created October 16, 2023 14:28
Concat rewrite from llama2 model
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>
module {
  func.func @concat_batchMM(%arg0: tensor<1x?x32x128xf32>, %arg1: tensor<1x32x1x128xf32>, %arg2: tensor<1x32x1x128xf32>) -> (tensor<1x32x1x?xf32>, tensor<1x?x32x128xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %c1 = arith.constant 1 : index
    %0 = tensor.empty() : tensor<1x1x32x128xf32>
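    // Sketch (not from the gist): the concat along the dynamic sequence dim is
    // typically expressed as a pair of tensor.insert_slice ops into a larger
    // destination; %transposed (the 1x1x32x128 transpose of %arg1) and the
    // other value names below are hypothetical.
    %dim = tensor.dim %arg0, %c1 : tensor<1x?x32x128xf32>
    %len = arith.addi %dim, %c1 : index
    %dest = tensor.empty(%len) : tensor<1x?x32x128xf32>
    %head = tensor.insert_slice %arg0 into %dest[0, 0, 0, 0] [1, %dim, 32, 128] [1, 1, 1, 1] : tensor<1x?x32x128xf32> into tensor<1x?x32x128xf32>
    %concat = tensor.insert_slice %transposed into %head[0, %dim, 0, 0] [1, 1, 32, 128] [1, 1, 1, 1] : tensor<1x1x32x128xf32> into tensor<1x?x32x128xf32>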
Max191 / gist:908486a43bd86c83d865d7d25face75f
Created October 16, 2023 20:23
transpose batch_matmul fusion
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
module {
  func.func @concat_batchMM(%arg0: tensor<?x32x128xf32>, %arg1: tensor<32x1x128xf32>) -> (tensor<32x1x?xf32>, tensor<?x32x128xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x32x128xf32>
    %0 = tensor.empty(%dim) : tensor<32x?x128xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<?x32x128xf32>) outs(%0 : tensor<32x?x128xf32>) {
    ^bb0(%in: f32, %out: f32):
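      // Sketch: the transpose region just yields the input element; the rest
      // of the generic is cut off in the preview.
      linalg.yield %in : f32
    } -> tensor<32x?x128xf32>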
Max191 / gist:0b515716d65478d0e9fad83673c5a616
Created November 11, 2023 00:45
quantized matmul workload
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  util.global private @cst = #util.byte_pattern<1> : tensor<11008x32x128xi4>
  util.global private mutable @global_seed = #util.byte_pattern<2> : tensor<i64>
  func.func @transpose_extend_batch_matmul(%arg0: tensor<32x128xi16>) -> tensor<11008x32xi32> {
    %cst = util.global.load @cst : tensor<11008x32x128xi4>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<11008x32xi32>
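    // Sketch (continuation not shown in the preview): zero-fill the i32
    // accumulator; a multiply-accumulate linalg.generic over #map/#map1/#map2
    // would follow, as in the reassociated_grouped_quantized_matmul gist above.
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32xi32>) -> tensor<11008x32xi32>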