Skip to content

Instantly share code, notes, and snippets.

@Max191
Created September 14, 2023 19:54
Show Gist options
  • Save Max191/ae5f79c327dda99bafd273799f0a5cbe to your computer and use it in GitHub Desktop.
FuseDequantizationMatmul
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
// Pre-fusion form: the i4 weights are fully dequantized to a large f32 tensor
// by a standalone linalg.generic, and a second f32 generic then performs the
// matmul-style reduction against the activations.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
// %0: i4 quantized weights, 11008 rows x 32 groups x 128 elements per group.
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xf32>
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
// %3: f32 activations, viewed as 1x1x(32 groups)x(128 elements).
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%4 = tensor.empty() : tensor<11008x32x128xf32>
// Drop the trailing unit dims so the per-(row, group) parameters index as 2-D.
// %1 is multiplied in below (scale); %2 is subtracted (zero point).
%collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
%collapsed_0 = tensor.collapse_shape %2 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
// Dequantize every weight element: f32 = (extui(i4) - zero_point) * scale.
%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %collapsed, %collapsed_0 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%4 : tensor<11008x32x128xf32>) {
^bb0(%in: i4, %in_2: f32, %in_3: f32, %out: f32):
%10 = arith.extui %in : i4 to i32
%11 = arith.uitofp %10 : i32 to f32
%12 = arith.subf %11, %in_3 : f32
%13 = arith.mulf %12, %in_2 : f32
linalg.yield %13 : f32
} -> tensor<11008x32x128xf32>
// Flatten the activations' leading unit dims to match the (group, elem) maps.
%collapsed_1 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x1x32x128xf32> into tensor<32x128xf32>
%6 = tensor.empty() : tensor<11008xf32>
%7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<11008xf32>) -> tensor<11008xf32>
// Matmul as a reduction: out[d0] = sum over (d1, d2) of act[d1, d2] * deq[d0, d1, d2].
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%collapsed_1, %5 : tensor<32x128xf32>, tensor<11008x32x128xf32>) outs(%7 : tensor<11008xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%10 = arith.mulf %in, %in_2 : f32
%11 = arith.addf %10, %out : f32
linalg.yield %11 : f32
} -> tensor<11008xf32>
// Restore the 1x1x11008 shape the caller exported.
%expanded = tensor.expand_shape %8 [[0, 1, 2]] : tensor<11008xf32> into tensor<1x1x11008xf32>
%9 = hal.tensor.export %expanded "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %9 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-flow-fuse-dequantization-matmul) //----- //
// Post-fusion form: instead of materializing the dequantized f32 weights, the
// pass dynamically quantizes the activations to i8 per group (first dispatch
// region), performs the inner product entirely in integers, and applies all
// scales / zero-point corrections in a final f32 rescale (second region).
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// 127.0 is the divisor used to derive the per-group i8 activation scale below.
%cst = arith.constant 1.270000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c0_i16 = arith.constant 0 : i16
// %0: i4 quantized weights; %1 weight scales; %2 weight zero points
// (roles established by their uses in the final rescale generic).
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
%collapsed_1 = tensor.collapse_shape %2 [[0], [1, 2]] : tensor<11008x32x1xf32> into tensor<11008x32xf32>
%collapsed_2 = tensor.collapse_shape %3 [[0, 1, 2], [3]] : tensor<1x1x32x128xf32> into tensor<32x128xf32>
%4 = tensor.empty() : tensor<11008xf32>
%5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<11008xf32>) -> tensor<11008xf32>
%6 = tensor.empty() : tensor<32xf32>
%7 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<32xf32>) -> tensor<32xf32>
%8 = tensor.empty() : tensor<32xf32>
%9 = tensor.empty() : tensor<32xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<32xf32>) -> tensor<32xf32>
%11 = tensor.empty() : tensor<32x128xi8>
// Dispatch 1: dynamic per-group quantization of the activations.
// Returns (quantized i8 activations, per-group sums, per-group scales).
%12:3 = flow.dispatch.region -> (tensor<32x128xi8>, tensor<32xf32>, tensor<32xf32>) {
// %17: per-group max(|act|) via absf + maxf reduction over the 128 elements.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed_2 : tensor<32x128xf32>) outs(%7 : tensor<32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = math.absf %in : f32
%22 = arith.maxf %21, %out : f32
linalg.yield %22 : f32
} -> tensor<32xf32>
// %18: per-group activation scale = max(|act|) / 127.
%18 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%17 : tensor<32xf32>) outs(%8 : tensor<32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.divf %in, %cst : f32
linalg.yield %21 : f32
} -> tensor<32xf32>
// %19: per-group sum of the raw activations (feeds the zero-point
// correction term in the final rescale).
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%collapsed_2 : tensor<32x128xf32>) outs(%10 : tensor<32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.addf %in, %out : f32
linalg.yield %21 : f32
} -> tensor<32xf32>
// %20: quantize each activation to i8 as fptoui(act / scale).
// NOTE(review): arith.fptoui of a negative value is poison in MLIR; this
// presumably relies on the activations (or the pass's preconditions)
// keeping the quotient non-negative — confirm against the pass.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%collapsed_2, %18 : tensor<32x128xf32>, tensor<32xf32>) outs(%11 : tensor<32x128xi8>) {
^bb0(%in: f32, %in_3: f32, %out: i8):
%21 = arith.divf %in, %in_3 : f32
%22 = arith.fptoui %21 : f32 to i8
linalg.yield %22 : i8
} -> tensor<32x128xi8>
flow.return %20, %19, %18 : tensor<32x128xi8>, tensor<32xf32>, tensor<32xf32>
}
%13 = tensor.empty() : tensor<11008x32xi16>
%14 = linalg.fill ins(%c0_i16 : i16) outs(%13 : tensor<11008x32xi16>) -> tensor<11008x32xi16>
// Dispatch 2: integer inner product followed by the f32 rescale.
%15 = flow.dispatch.region -> (tensor<11008xf32>) {
// %17: per-(row, group) accumulation over the 128-element axis of
// extsi(i8 act) * extui(i4 weight), accumulated in i16.
// NOTE(review): worst-case magnitude 127 * 15 * 128 exceeds the i16
// range; presumably the pass guarantees tighter value ranges — confirm.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%12#0, %0 : tensor<32x128xi8>, tensor<11008x32x128xi4>) outs(%14 : tensor<11008x32xi16>) {
^bb0(%in: i8, %in_3: i4, %out: i16):
%19 = arith.extsi %in : i8 to i16
%20 = arith.extui %in_3 : i4 to i16
%21 = arith.muli %19, %20 : i16
%22 = arith.addi %21, %out : i16
linalg.yield %22 : i16
} -> tensor<11008x32xi16>
// %18: reduce over groups in f32:
//   out[row] += acc * actScale * wScale  -  actSum * wScale * wZeroPoint
// which is the expansion of sum((q_w - zp) * wScale * act) with
// act ~= q_act * actScale.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%17, %12#2, %12#1, %collapsed, %collapsed_1 : tensor<11008x32xi16>, tensor<32xf32>, tensor<32xf32>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%5 : tensor<11008xf32>) {
^bb0(%in: i16, %in_3: f32, %in_4: f32, %in_5: f32, %in_6: f32, %out: f32):
%19 = arith.extsi %in : i16 to i32
%20 = arith.sitofp %19 : i32 to f32
%21 = arith.mulf %20, %in_3 : f32
%22 = arith.mulf %21, %in_5 : f32
%23 = arith.mulf %in_6, %in_5 : f32
%24 = arith.mulf %23, %in_4 : f32
%25 = arith.subf %22, %24 : f32
%26 = arith.addf %25, %out : f32
linalg.yield %26 : f32
} -> tensor<11008xf32>
flow.return %18 : tensor<11008xf32>
}
// Restore the 1x1x11008 shape the caller exported.
%expanded = tensor.expand_shape %15 [[0, 1, 2]] : tensor<11008xf32> into tensor<1x1x11008xf32>
%16 = hal.tensor.export %expanded "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %16 : !hal.buffer_view
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment