Created
September 14, 2023 19:01
-
-
Save bjacob/0242746ba4b90c6c8a0314326b789f80 to your computer and use it in GitHub Desktop.
FuseDequantizationMatmul
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-flow-generalize-linalg-named-ops) //----- // | |
// Reference (pre-fusion) form: dequantize the i4 weights to f32, then do a
// plain f32 contraction against the activations.
//   inputs: %0 i4 weights (11008 rows x 32 groups x 128 elems/group),
//           %1 per-group scales, %2 per-group zero points, %3 f32 activations.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4> | |
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32> | |
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32> | |
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32> | |
%4 = tensor.empty() : tensor<1x1x11008xf32> | |
%5 = tensor.empty() : tensor<11008x32x128xf32> | |
// Zero-init the 1x1x11008 f32 accumulator for the contraction below.
%6 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32> | |
// Dequantize, elementwise over the full weight tensor:
//   w_f32 = (extui(w_i4) - zero_point) * scale
// The i4 is widened with extui, i.e. weights are treated as unsigned 0..15;
// scale/zero-point are broadcast along the last dim via the (d0, d1, 0) maps.
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0, %1, %2 : tensor<11008x32x128xi4>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<11008x32x128xf32>) { | |
^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): | |
%10 = arith.extui %in : i4 to i32 | |
%11 = arith.uitofp %10 : i32 to f32 | |
%12 = arith.subf %11, %in_1 : f32 | |
%13 = arith.mulf %12, %in_0 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<11008x32x128xf32> | |
// f32 contraction: multiply-accumulate activations (%3) against the
// dequantized weights (%7), reducing over the group dim (d3=32) and the
// inner dim (d4=128); d2 indexes the 11008 output rows.
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%3, %7 : tensor<1x1x32x128xf32>, tensor<11008x32x128xf32>) outs(%6 : tensor<1x1x11008xf32>) { | |
^bb0(%in: f32, %in_0: f32, %out: f32): | |
%10 = arith.mulf %in, %in_0 : f32 | |
%11 = arith.addf %10, %out : f32 | |
linalg.yield %11 : f32 | |
} -> tensor<1x1x11008xf32> | |
%9 = hal.tensor.export %8 "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view | |
return %9 : !hal.buffer_view | |
} | |
// -----// IR Dump After FuseDequantizationMatmul (iree-flow-fuse-dequantization-matmul) //----- // | |
// Fused form produced by FuseDequantizationMatmul: the activations are
// dynamically quantized to i8 (dispatch 1) so the contraction can run in
// integer arithmetic (dispatch 2), with the f32 scales/zero-points applied
// once per 32-element group instead of per element.
func.func @quantized_matmul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
// 127.0 = max magnitude representable when quantizing to signed 8-bit.
%cst = arith.constant 1.270000e+02 : f32 | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%c0_i16 = arith.constant 0 : i16 | |
%0 = hal.tensor.import %arg0 "input 0" : !hal.buffer_view -> tensor<11008x32x128xi4> | |
%1 = hal.tensor.import %arg1 "input 1" : !hal.buffer_view -> tensor<11008x32x1xf32> | |
%2 = hal.tensor.import %arg2 "input 2" : !hal.buffer_view -> tensor<11008x32x1xf32> | |
%3 = hal.tensor.import %arg3 "input 3" : !hal.buffer_view -> tensor<1x1x32x128xf32> | |
%4 = tensor.empty() : tensor<1x1x11008xf32> | |
%5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<1x1x11008xf32>) -> tensor<1x1x11008xf32> | |
%6 = tensor.empty() : tensor<1x1x32xf32> | |
%7 = linalg.fill ins(%cst_0 : f32) outs(%6 : tensor<1x1x32xf32>) -> tensor<1x1x32xf32> | |
%8 = tensor.empty() : tensor<1x1x32xf32> | |
%9 = tensor.empty() : tensor<1x1x32xf32> | |
%10 = linalg.fill ins(%cst_0 : f32) outs(%9 : tensor<1x1x32xf32>) -> tensor<1x1x32xf32> | |
%11 = tensor.empty() : tensor<1x1x32x128xi8> | |
// Dispatch 1: dynamic quantization of the activations %3. Returns
//   #0 quantized i8 activations, #1 per-group sums, #2 per-group scales.
%12:3 = flow.dispatch.region -> (tensor<1x1x32x128xi8>, tensor<1x1x32xf32>, tensor<1x1x32xf32>) { | |
// %17: per-group absolute maximum (reduce d3=128 via absf + maxf).
// NOTE(review): seeded with 0.0 via %7, so the result is max(|x|, 0).
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3 : tensor<1x1x32x128xf32>) outs(%7 : tensor<1x1x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%21 = math.absf %in : f32 | |
%22 = arith.maxf %21, %out : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<1x1x32xf32> | |
// %18: per-group scale = absmax / 127.
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<1x1x32xf32>) outs(%8 : tensor<1x1x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%21 = arith.divf %in, %cst : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x32xf32> | |
// %19: per-group sum of the raw f32 activations (used for the zero-point
// correction term in dispatch 2).
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3 : tensor<1x1x32x128xf32>) outs(%10 : tensor<1x1x32xf32>) { | |
^bb0(%in: f32, %out: f32): | |
%21 = arith.addf %in, %out : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x32xf32> | |
// %20: quantize q = fptoui(x / scale).
// NOTE(review): arith.fptoui of a negative value is poison; the quotient is
// negative whenever the activation is — confirm activations are known
// non-negative here, or whether fptosi was intended.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %18 : tensor<1x1x32x128xf32>, tensor<1x1x32xf32>) outs(%11 : tensor<1x1x32x128xi8>) { | |
^bb0(%in: f32, %in_1: f32, %out: i8): | |
%21 = arith.divf %in, %in_1 : f32 | |
%22 = arith.fptoui %21 : f32 to i8 | |
linalg.yield %22 : i8 | |
} -> tensor<1x1x32x128xi8> | |
flow.return %20, %19, %18 : tensor<1x1x32x128xi8>, tensor<1x1x32xf32>, tensor<1x1x32xf32> | |
} | |
// i16 accumulator for the integer matmul, keeping a partial sum per group
// (d3=32 stays parallel in the first generic below).
%13 = tensor.empty() : tensor<1x1x11008x32xi16> | |
%14 = linalg.fill ins(%c0_i16 : i16) outs(%13 : tensor<1x1x11008x32xi16>) -> tensor<1x1x11008x32xi16> | |
// Dispatch 2: integer contraction + per-group rescale/correction.
%15 = flow.dispatch.region -> (tensor<1x1x11008xf32>) { | |
// %17: q_act(i8, extsi) * w(i4, extui) accumulated in i16, reducing only
// the inner dim d4=128; one i16 partial sum per (row, group).
// NOTE(review): the i8 values came from fptoui above but are re-widened
// with extsi here, so values >= 128 would read as negative — confirm the
// quantized range stays within 0..127. Also, up to 128 products of
// magnitude <= 127*15 are summed into i16; confirm the value ranges make
// overflow impossible here.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%12#0, %0 : tensor<1x1x32x128xi8>, tensor<11008x32x128xi4>) outs(%14 : tensor<1x1x11008x32xi16>) { | |
^bb0(%in: i8, %in_1: i4, %out: i16): | |
%19 = arith.extsi %in : i8 to i16 | |
%20 = arith.extui %in_1 : i4 to i16 | |
%21 = arith.muli %19, %20 : i16 | |
%22 = arith.addi %21, %out : i16 | |
linalg.yield %22 : i16 | |
} -> tensor<1x1x11008x32xi16> | |
// %18: reduce over the group dim d3, converting each group's integer
// partial sum back to f32 and applying the zero-point correction:
//   acc += qsum * s_act * s_w  -  (zp * s_w) * sum_act
// where %in_1 = act scale (%12#2), %in_2 = act group sum (%12#1),
// %in_3 = weight scale (%1), %in_4 = weight zero point (%2).
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, 0)>, affine_map<(d0, d1, d2, d3) -> (d2, d3, 0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%17, %12#2, %12#1, %1, %2 : tensor<1x1x11008x32xi16>, tensor<1x1x32xf32>, tensor<1x1x32xf32>, tensor<11008x32x1xf32>, tensor<11008x32x1xf32>) outs(%5 : tensor<1x1x11008xf32>) { | |
^bb0(%in: i16, %in_1: f32, %in_2: f32, %in_3: f32, %in_4: f32, %out: f32): | |
%19 = arith.extsi %in : i16 to i32 | |
%20 = arith.sitofp %19 : i32 to f32 | |
%21 = arith.mulf %20, %in_1 : f32 | |
%22 = arith.mulf %21, %in_3 : f32 | |
%23 = arith.mulf %in_4, %in_3 : f32 | |
%24 = arith.mulf %23, %in_2 : f32 | |
%25 = arith.subf %22, %24 : f32 | |
%26 = arith.addf %25, %out : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<1x1x11008xf32> | |
flow.return %18 : tensor<1x1x11008xf32> | |
} | |
%16 = hal.tensor.export %15 "output 0" : tensor<1x1x11008xf32> -> !hal.buffer_view | |
return %16 : !hal.buffer_view | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment