Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created September 14, 2023 19:01
Show Gist options
  • Save bjacob/0242746ba4b90c6c8a0314326b789f80 to your computer and use it in GitHub Desktop.
FuseDequantizationMatmul
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-flow-generalize-linalg-named-ops) //----- //
// Pre-fusion form: the grouped i4 weights are dequantized to f32 by one
// linalg.generic, then a second f32 linalg.generic performs the matmul-like
// contraction. NOTE(review): this materializes the entire 11008x32x128 f32
// dequantized weight tensor (%7) as an intermediate.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%cst = arith.constant 0.000000e+00 : f32
// Quantized weights: 11008 rows, 32 groups per row, 128 i4 values per group.
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
// Per-group f32 parameters. In the dequantize body below, input 1 (%1) is the
// multiplied factor (scale-like) and input 2 (%2) is the subtracted term
// (zero-point-like) — presumed naming, inferred only from how they are used.
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
// f32 activations, grouped along the trailing 32x128 dims to match the weights.
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%4 = tensor.empty() : tensor<1x1x11008xf32>
%5 = tensor.empty() : tensor<11008x32x128xf32>
// Zero-initialized f32 accumulator for the contraction result.
%6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32>
// Dequantize (all-parallel, element-wise): out = (uitofp(extui(w)) - %2) * %1.
// The 11008x32x1 params broadcast over the 128-wide group dimension via the
// "(d0, d1, 0)" indexing maps.
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %1, %2 : tensor<11008x32x128xi4>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<11008x32x128xf32>) {
^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
%10 = arith.extui %in : i4 to i32
%11 = arith.uitofp %10 : i32 to f32
%12 = arith.subf %11, %in_1 : f32
%13 = arith.mulf %12, %in_0 : f32
linalg.yield %13 : f32
} -> tensor<11008x32x128xf32>
// f32 contraction: multiply-accumulate reducing over d3 (group) and d4
// (element within group), yielding the 1x1x11008 output.
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<1x1x32x128xf32>, tensor<11008x32x128xf32>) outs(%6 : tensor<1x1x11008xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%10 = arith.mulf %in, %in_0 : f32
%11 = arith.addf %10, %out : f32
linalg.yield %11 : f32
} -> tensor<1x1x11008xf32>
%9 = hal.tensor.export %8 "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %9 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-flow-fuse-dequantization-matmul) //----- //
// Post-fusion form: instead of dequantizing the weights to f32, the pass
// dynamically quantizes the f32 activations (%3) to i8 per 128-wide group,
// performs the inner contraction in integer arithmetic (i8 x i4 accumulated
// in i16), and folds all scales/offsets back in during a final f32 reduction.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
// 127.0: divisor used to derive the per-group activation scale from the
// group's absolute maximum (i.e. map absmax to the top of the i8 range).
%cst = arith.constant 1.270000e+02 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%c0_i16 = arith.constant 0 : i16
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4>
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32>
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32>
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32>
%4 = tensor.empty() : tensor<1x1x11008xf32>
%5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32>
%6 = tensor.empty() : tensor<1x1x32xf32>
%7 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<1x1x32xf32>) -> tensor<1x1x32xf32>
%8 = tensor.empty() : tensor<1x1x32xf32>
%9 = tensor.empty() : tensor<1x1x32xf32>
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x1x32xf32>) -> tensor<1x1x32xf32>
%11 = tensor.empty() : tensor<1x1x32x128xi8>
// Dispatch 1: dynamic per-group quantization of the activations.
// Returns (quantized i8 activations, per-group sums, per-group scales).
%12:3 = flow.dispatch.region -> (tensor<1x1x32x128xi8>, tensor<1x1x32xf32>, tensor<1x1x32xf32>) {
// %17: per-group absolute maximum of the activations (reduce over d3).
// NOTE(review): arith.maxf against a 0.0-filled init — fine for absf inputs.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3 : tensor<1x1x32x128xf32>) outs(%7 : tensor<1x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = math.absf %in : f32
%22 = arith.maxf %21, %out : f32
linalg.yield %22 : f32
} -> tensor<1x1x32xf32>
// %18: per-group scale = absmax / 127.0.
// NOTE(review): a group that is all zeros yields scale 0.0, making the
// divf in %20 below 0/0 — presumably the pass assumes this cannot happen
// or tolerates the result; verify.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<1x1x32xf32>) outs(%8 : tensor<1x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.divf %in, %cst : f32
linalg.yield %21 : f32
} -> tensor<1x1x32xf32>
// %19: per-group sum of the f32 activations (used in the final correction
// term for the weight zero-point-like parameter).
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3 : tensor<1x1x32x128xf32>) outs(%10 : tensor<1x1x32xf32>) {
^bb0(%in: f32, %out: f32):
%21 = arith.addf %in, %out : f32
linalg.yield %21 : f32
} -> tensor<1x1x32xf32>
// %20: quantize activations to i8: round(x / scale) via fptoui.
// NOTE(review): fptoui of a negative quotient (negative activation) is
// poison per the arith dialect — this looks like it assumes non-negative
// activations; confirm against the pass's preconditions.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %18 : tensor<1x1x32x128xf32>, tensor<1x1x32xf32>) outs(%11 : tensor<1x1x32x128xi8>) {
^bb0(%in: f32, %in_1: f32, %out: i8):
%21 = arith.divf %in, %in_1 : f32
%22 = arith.fptoui %21 : f32 to i8
linalg.yield %22 : i8
} -> tensor<1x1x32x128xi8>
flow.return %20, %19, %18 : tensor<1x1x32x128xi8>, tensor<1x1x32xf32>, tensor<1x1x32xf32>
}
// Per-(row, group) i16 accumulator for the integer inner product.
%13 = tensor.empty() : tensor<1x1x11008x32xi16>
%14 = linalg.fill ins(%c0_i16 : i16) outs(%13 : tensor<1x1x11008x32xi16>) -> tensor<1x1x11008x32xi16>
// Dispatch 2: integer contraction plus rescale.
%15 = flow.dispatch.region -> (tensor<1x1x11008xf32>) {
// %17: per-(row, group) integer dot product over the 128 elements (d4):
// extsi(q_act) * extui(w) accumulated in i16.
// NOTE(review): 128 products of up to 127 x 15 can exceed the i16 max
// (32767) — presumably the pass relies on a tighter value range; verify.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%12#0, %0 : tensor<1x1x32x128xi8>, tensor<11008x32x128xi4>) outs(%14 : tensor<1x1x11008x32xi16>) {
^bb0(%in: i8, %in_1: i4, %out: i16):
%19 = arith.extsi %in : i8 to i16
%20 = arith.extui %in_1 : i4 to i16
%21 = arith.muli %19, %20 : i16
%22 = arith.addi %21, %out : i16
linalg.yield %22 : i16
} -> tensor<1x1x11008x32xi16>
// %18: reduce over groups (d3) in f32, re-applying the scales:
//   out += dot * act_scale (%12#2) * %1  -  (%2 * %1) * group_sum (%12#1)
// i.e. the product of the dynamic activation scale with the weight
// "input 1" factor, minus the correction for the subtracted "input 2"
// term times the group's activation sum.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, 0)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%17, %12#2, %12#1, %1, %2 : tensor<1x1x11008x32xi16>, tensor<1x1x32xf32>, tensor<1x1x32xf32>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<1x1x11008xf32>) {
^bb0(%in: i16, %in_1: f32, %in_2: f32, %in_3: f32, %in_4: f32, %out: f32):
%19 = arith.extsi %in : i16 to i32
%20 = arith.sitofp %19 : i32 to f32
%21 = arith.mulf %20, %in_1 : f32
%22 = arith.mulf %21, %in_3 : f32
%23 = arith.mulf %in_4, %in_3 : f32
%24 = arith.mulf %23, %in_2 : f32
%25 = arith.subf %22, %24 : f32
%26 = arith.addf %25, %out : f32
linalg.yield %26 : f32
} -> tensor<1x1x11008xf32>
flow.return %18 : tensor<1x1x11008xf32>
}
%16 = hal.tensor.export %15 "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view
return %16 : !hal.buffer_view
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment