
Max191 / afterCollapseDims.mlir
Last active July 17, 2023 19:31
IR for dequant + matmul fusion compile test
// When running with the CollapseDims pass:
// Dims on the matmul are collapsed, but not on the dequantization, so the ops do not get fused during tiling
// -----// IR Dump After CollapseDims (iree-flow-collapse-dims) //----- //
func.func @something(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant dense<0.000000e+00> : tensor<4096x32xf32>
  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<4096x32x128xi8>
  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<1x32x128xf32>
  %2 = util.optimization_barrier %cst_0 : tensor<4096x32xf32>
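A minimal sketch (not part of the gist; the shapes and value names are hypothetical) of the mismatch described in the comments above: the matmul operand is collapsed to 2-D while the dequantization still produces the 3-D tensor, so the two ops no longer share an iteration space for tile-and-fuse.
// Collapse of the two inner dims (32 * 128 = 4096) of the dequantized operand
// feeding the matmul; the dequantization generic itself keeps producing
// tensor<4096x32x128xf32>, which is what blocks fusion during tiling.
%dequant_collapsed = tensor.collapse_shape %dequant [[0], [1, 2]] : tensor<4096x32x128xf32> into tensor<4096x4096xf32>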
Max191 / quantized_matmul.mlir
Created September 12, 2023 15:29
IR for reassociation of quantized matmul
builtin.module {
  func.func @quantized_matmul(%arg0: tensor<11008x32x128xi8>, %arg1: tensor<11008x32x1xf32>, %arg2: tensor<11008x32x1xf32>, %arg3: tensor<1x1x32x128xf32>) -> tensor<1x1x11008xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %4 = tensor.empty() : tensor<1x1x11008xf32>
    %5 = tensor.empty() : tensor<11008x32x128xf32>
    %6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32>
    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0, %arg1, %arg2 : tensor<11008x32x128xi8>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<11008x32x128xf32>) {
    ^bb0(%in: i8, %in_0: f32, %in_1: f32, %out: f32):
      %9 = arith.extui %in : i8 to i32
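      // Sketch (not from the gist): the dequantization body typically continues
      // by converting to float, subtracting the zero point (%in_1 assumed) and
      // multiplying by the scale (%in_0 assumed); the exact ops may differ.
      %10 = arith.uitofp %9 : i32 to f32
      %11 = arith.subf %10, %in_1 : f32
      %12 = arith.mulf %11, %in_0 : f32
      linalg.yield %12 : f32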
Max191 / fusion_results.mlir
Created September 14, 2023 19:54
FuseDequantizationMatmul
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
  %4 = tensor.empty() : tensor<11008x32x128xf32>
  %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
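  // Sketch (continuation not shown in the preview): the other unit-extent
  // operands are typically collapsed the same way; the value names below are
  // hypothetical.
  %collapsed_0 = tensor.collapse_shape %2 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
  %collapsed_1 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x1x32x128xf32> into tensor<32x128xf32>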
Max191 / file1_fuse_dequant_matmul.mlir
Last active September 18, 2023 16:57
Tiling for reassociated quantized matmul
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
  %1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
  %3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
  %4 = tensor.empty() : tensor<11008x32x128xf32>
  %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
Max191 / gist:750a9d7e325e4fe3a712a193382ba86c
Last active September 26, 2023 20:07
Incorrect tiling of reassociated quantized matmul
// -----// IR Dump After TileAndDecomposeWinogradTransform (iree-linalg-ext-tile-and-decompose-winograd) //----- //
func.func @quantized_matmul_dispatch_3_generic_11008x32x128_i16xi8xi32() {
  %c11008 = arith.constant 11008 : index
  %c256 = arith.constant 256 : index
  %c0 = arith.constant 0 : index
  %c128 = arith.constant 128 : index
  %c0_i32 = arith.constant 0 : i32
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c256) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x128xi16>>
Max191 / llama2_cpu_perf_mlir.patch
Created October 7, 2023 13:22
MLIR patch for llama2 CPU performance
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 7f8322bd5f6f..762de3b99494 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -460,6 +460,10 @@ LogicalResult LoadOpOfExpandShapeOpFolder<OpTy>::matchAndRewrite(
rewriter.replaceOpWithNewOp<decltype(op)>(
loadOp, expandShapeOp.getViewSource(), sourceIndices);
})
+ .Case([&](vector::LoadOp op) {
+ rewriter.replaceOpWithNewOp<vector::LoadOp>(
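In IR terms, the case added by the patch lets a vector.load fold through a memref.expand_shape the same way the existing load cases do. A rough sketch of the rewrite (buffer shapes, value names, and the vector width are illustrative, not taken from the patch):
// Before: a vector.load through an expand_shape of a flat buffer.
%expanded = memref.expand_shape %src [[0, 1]] : memref<4096xf32> into memref<32x128xf32>
%v = vector.load %expanded[%i, %j] : memref<32x128xf32>, vector<4xf32>
// After: load directly from the un-expanded source with a linearized index.
%idx = affine.apply affine_map<(d0, d1) -> (d0 * 128 + d1)>(%i, %j)
%v_folded = vector.load %src[%idx] : memref<4096xf32>, vector<4xf32>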
Max191 / reassociated_grouped_quantized_matmul.mlir
Created October 11, 2023 20:31
reassociated_grouped_quantized_matmul
func.func @reassociated_grouped_quantized_matmul(%vec: tensor<32x128xi16>, %mat: tensor<11008x32x128xi4>, %vec_scales: tensor<32xf32>, %vec_scaled_sums: tensor<32xf32>, %mat_scales: tensor<11008x32xf32>, %mat_zps: tensor<11008x32xf32>) -> tensor<11008xf32> {
  %c0_i32 = arith.constant 0 : i32
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<11008x32xi32>
  %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32xi32>) -> tensor<11008x32xi32>
  %2 = tensor.empty() : tensor<11008xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<11008xf32>) -> tensor<11008xf32>
  %batch_matmul_result = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
      affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
      affine_map<(d0, d1, d2) -> (d0, d1)>],
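      // Sketch (the preview cuts the op off here): iterator types plus an
      // extend-multiply-accumulate region; the exact extension ops used in
      // the gist may differ.
      iterator_types = ["parallel", "parallel", "reduction"]}
      ins(%vec, %mat : tensor<32x128xi16>, tensor<11008x32x128xi4>)
      outs(%1 : tensor<11008x32xi32>) {
  ^bb0(%v: i16, %m: i4, %acc: i32):
    %ext_v = arith.extsi %v : i16 to i32
    %ext_m = arith.extui %m : i4 to i32
    %prod = arith.muli %ext_v, %ext_m : i32
    %sum = arith.addi %acc, %prod : i32
    linalg.yield %sum : i32
  } -> tensor<11008x32xi32>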
Max191 / gist:f9705764cc3cd650f3c547071dcc03a9
Created October 16, 2023 14:28
Concat rewrite from llama2 model
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3, d2)>
module {
  func.func @concat_batchMM(%arg0: tensor<1x?x32x128xf32>, %arg1: tensor<1x32x1x128xf32>, %arg2: tensor<1x32x1x128xf32>) -> (tensor<1x32x1x?xf32>, tensor<1x?x32x128xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %c1 = arith.constant 1 : index
    %0 = tensor.empty() : tensor<1x1x32x128xf32>
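    // Sketch (not from the gist): the concat along the dynamic sequence dim is
    // typically expressed as a pair of tensor.insert_slice ops into a larger
    // destination; %transposed (the 1x1x32x128 transpose of %arg1) and the
    // other value names below are hypothetical.
    %dim = tensor.dim %arg0, %c1 : tensor<1x?x32x128xf32>
    %len = arith.addi %dim, %c1 : index
    %dest = tensor.empty(%len) : tensor<1x?x32x128xf32>
    %head = tensor.insert_slice %arg0 into %dest[0, 0, 0, 0] [1, %dim, 32, 128] [1, 1, 1, 1] : tensor<1x?x32x128xf32> into tensor<1x?x32x128xf32>
    %concat = tensor.insert_slice %transposed into %head[0, %dim, 0, 0] [1, 1, 32, 128] [1, 1, 1, 1] : tensor<1x1x32x128xf32> into tensor<1x?x32x128xf32>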
Max191 / gist:908486a43bd86c83d865d7d25face75f
Created October 16, 2023 20:23
transpose batch_matmul fusion
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
module {
  func.func @concat_batchMM(%arg0: tensor<?x32x128xf32>, %arg1: tensor<32x1x128xf32>) -> (tensor<32x1x?xf32>, tensor<?x32x128xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?x32x128xf32>
    %0 = tensor.empty(%dim) : tensor<32x?x128xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<?x32x128xf32>) outs(%0 : tensor<32x?x128xf32>) {
    ^bb0(%in: f32, %out: f32):
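      // Sketch: the transpose region just yields the input element; the rest
      // of the generic is cut off in the preview.
      linalg.yield %in : f32
    } -> tensor<32x?x128xf32>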
Max191 / gist:0b515716d65478d0e9fad83673c5a616
Created November 11, 2023 00:45
quantized matmul workload
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  util.global private @cst = #util.byte_pattern<1> : tensor<11008x32x128xi4>
  util.global private mutable @global_seed = #util.byte_pattern<2> : tensor<i64>
  func.func @transpose_extend_batch_matmul(%arg0: tensor<32x128xi16>) -> tensor<11008x32xi32> {
    %cst = util.global.load @cst : tensor<11008x32x128xi4>
    %c0_i32 = arith.constant 0 : i32
    %0 = tensor.empty() : tensor<11008x32xi32>
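    // Sketch (continuation not shown in the preview): zero-fill the i32
    // accumulator; a multiply-accumulate linalg.generic over #map/#map1/#map2
    // would follow, as in the reassociated_grouped_quantized_matmul gist above.
    %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32xi32>) -> tensor<11008x32xi32>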