diff --git a/dispatch_537.mlir b/dispatch_545.mlir
index 43efe8e..244c54c 100644
--- a/dispatch_537.mlir
+++ b/dispatch_545.mlir
@@ -1,17 +1,17 @@
- flow.executable private @main_dispatch_537 {
- flow.executable.export public @main_dispatch_537_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
+ flow.executable private @main_dispatch_545 {
+ flow.executable.export public @main_dispatch_545_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @main_dispatch_537_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i8, %arg11: i32, %arg12: i8, %arg13: i32, %arg14: i32, %arg15: i8, %arg16: i32, %arg17: i8, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i32, %arg25: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg28: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
+ func.func @main_dispatch_545_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i8, %arg12: i32, %arg13: i8, %arg14: i32, %arg15: i32, %arg16: i8, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i8, %arg25: i32, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg28: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg31: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
%0 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xi8>> -> tensor<384x512xi8>
%1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xi8>> -> tensor<512x128xi8>
- %2 = flow.dispatch.tensor.load %arg25, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
- %3 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
- %4 = flow.dispatch.tensor.load %arg27, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
- %5 = flow.dispatch.tensor.load %arg28, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
- %6 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
+ %2 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
+ %3 = flow.dispatch.tensor.load %arg27, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
+ %4 = flow.dispatch.tensor.load %arg28, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
+ %5 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
+ %6 = flow.dispatch.tensor.load %arg30, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
%7 = tensor.empty() : tensor<384x128xi8>
%8 = tensor.empty() : tensor<384x128xi32>
%9 = linalg.fill ins(%arg0 : i32) outs(%8 : tensor<384x128xi32>) -> tensor<384x128xi32>
@@ -26,76 +26,77 @@
%17 = arith.cmpi sgt, %14, %arg7 : i32
%18 = arith.select %17, %arg7, %16 : i32
%19 = arith.extsi %in_2 : i8 to i32
- %20 = "tosa.apply_scale"(%19, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
- %21 = arith.cmpi slt, %20, %arg6 : i32
- %22 = arith.select %21, %arg6, %20 : i32
- %23 = arith.cmpi sgt, %20, %arg7 : i32
- %24 = arith.select %23, %arg7, %22 : i32
- %25 = arith.extsi %in_4 : i8 to i32
- %26 = arith.subi %25, %arg8 : i32
- %27 = "tosa.apply_scale"(%26, %arg9, %arg10) {double_round = false} : (i32, i32, i8) -> i32
- %28 = arith.cmpi slt, %27, %arg6 : i32
- %29 = arith.select %28, %arg6, %27 : i32
- %30 = arith.cmpi sgt, %27, %arg7 : i32
- %31 = arith.select %30, %arg7, %29 : i32
- %32 = arith.muli %24, %18 : i32
- %33 = "tosa.apply_scale"(%32, %arg11, %arg12) {double_round = true} : (i32, i32, i8) -> i32
- %34 = arith.addi %33, %arg13 : i32
- %35 = arith.cmpi slt, %34, %arg3 : i32
- %36 = arith.select %35, %arg3, %34 : i32
- %37 = arith.cmpi sgt, %34, %arg14 : i32
- %38 = arith.select %37, %arg14, %36 : i32
- %39 = arith.trunci %38 : i32 to i8
- %40 = arith.extsi %39 : i8 to i32
- %41 = arith.subi %40, %arg13 : i32
- %42 = "tosa.apply_scale"(%41, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
- %43 = arith.cmpi slt, %42, %arg6 : i32
- %44 = arith.select %43, %arg6, %42 : i32
- %45 = arith.cmpi sgt, %42, %arg7 : i32
- %46 = arith.select %45, %arg7, %44 : i32
- %47 = arith.addi %46, %31 : i32
- %48 = arith.muli %in_1, %arg3 : i32
- %49 = arith.subi %in_0, %48 : i32
- %50 = arith.addi %in, %49 : i32
- %51 = "tosa.apply_scale"(%50, %arg16, %arg17) {double_round = true} : (i32, i32, i8) -> i32
- %52 = arith.addi %51, %arg18 : i32
- %53 = arith.cmpi slt, %52, %arg3 : i32
- %54 = arith.select %53, %arg3, %52 : i32
- %55 = arith.cmpi sgt, %52, %arg14 : i32
- %56 = arith.select %55, %arg14, %54 : i32
- %57 = arith.trunci %56 : i32 to i8
- %58 = "tosa.apply_scale"(%47, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
- %59 = arith.addi %58, %arg8 : i32
- %60 = arith.cmpi slt, %59, %arg3 : i32
- %61 = arith.select %60, %arg3, %59 : i32
- %62 = arith.cmpi sgt, %59, %arg14 : i32
- %63 = arith.select %62, %arg14, %61 : i32
- %64 = arith.trunci %63 : i32 to i8
- %65 = arith.extsi %64 : i8 to i32
- %66 = arith.subi %65, %arg8 : i32
- %67 = "tosa.apply_scale"(%66, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
- %68 = arith.cmpi slt, %67, %arg6 : i32
- %69 = arith.select %68, %arg6, %67 : i32
- %70 = arith.cmpi sgt, %67, %arg7 : i32
- %71 = arith.select %70, %arg7, %69 : i32
- %72 = arith.extsi %57 : i8 to i32
- %73 = arith.subi %72, %arg18 : i32
- %74 = "tosa.apply_scale"(%73, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
- %75 = arith.cmpi slt, %74, %arg6 : i32
- %76 = arith.select %75, %arg6, %74 : i32
- %77 = arith.cmpi sgt, %74, %arg7 : i32
- %78 = arith.select %77, %arg7, %76 : i32
- %79 = arith.addi %78, %71 : i32
- %80 = "tosa.apply_scale"(%79, %arg23, %arg20) {double_round = true} : (i32, i32, i8) -> i32
- %81 = arith.addi %80, %arg24 : i32
- %82 = arith.cmpi slt, %81, %arg3 : i32
- %83 = arith.select %82, %arg3, %81 : i32
- %84 = arith.cmpi sgt, %81, %arg14 : i32
- %85 = arith.select %84, %arg14, %83 : i32
- %86 = arith.trunci %85 : i32 to i8
- linalg.yield %86 : i8
+ %20 = arith.subi %19, %arg8 : i32
+ %21 = "tosa.apply_scale"(%20, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
+ %22 = arith.cmpi slt, %21, %arg6 : i32
+ %23 = arith.select %22, %arg6, %21 : i32
+ %24 = arith.cmpi sgt, %21, %arg7 : i32
+ %25 = arith.select %24, %arg7, %23 : i32
+ %26 = arith.extsi %in_4 : i8 to i32
+ %27 = arith.subi %26, %arg9 : i32
+ %28 = "tosa.apply_scale"(%27, %arg10, %arg11) {double_round = false} : (i32, i32, i8) -> i32
+ %29 = arith.cmpi slt, %28, %arg6 : i32
+ %30 = arith.select %29, %arg6, %28 : i32
+ %31 = arith.cmpi sgt, %28, %arg7 : i32
+ %32 = arith.select %31, %arg7, %30 : i32
+ %33 = arith.muli %25, %18 : i32
+ %34 = "tosa.apply_scale"(%33, %arg12, %arg13) {double_round = true} : (i32, i32, i8) -> i32
+ %35 = arith.addi %34, %arg14 : i32
+ %36 = arith.cmpi slt, %35, %arg3 : i32
+ %37 = arith.select %36, %arg3, %35 : i32
+ %38 = arith.cmpi sgt, %35, %arg15 : i32
+ %39 = arith.select %38, %arg15, %37 : i32
+ %40 = arith.trunci %39 : i32 to i8
+ %41 = arith.extsi %40 : i8 to i32
+ %42 = arith.subi %41, %arg14 : i32
+ %43 = "tosa.apply_scale"(%42, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
+ %44 = arith.cmpi slt, %43, %arg6 : i32
+ %45 = arith.select %44, %arg6, %43 : i32
+ %46 = arith.cmpi sgt, %43, %arg7 : i32
+ %47 = arith.select %46, %arg7, %45 : i32
+ %48 = arith.addi %47, %32 : i32
+ %49 = arith.muli %in_1, %arg3 : i32
+ %50 = arith.subi %in_0, %49 : i32
+ %51 = arith.addi %in, %50 : i32
+ %52 = "tosa.apply_scale"(%51, %arg17, %arg13) {double_round = true} : (i32, i32, i8) -> i32
+ %53 = arith.addi %52, %arg18 : i32
+ %54 = arith.cmpi slt, %53, %arg3 : i32
+ %55 = arith.select %54, %arg3, %53 : i32
+ %56 = arith.cmpi sgt, %53, %arg15 : i32
+ %57 = arith.select %56, %arg15, %55 : i32
+ %58 = arith.trunci %57 : i32 to i8
+ %59 = "tosa.apply_scale"(%48, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
+ %60 = arith.addi %59, %arg8 : i32
+ %61 = arith.cmpi slt, %60, %arg3 : i32
+ %62 = arith.select %61, %arg3, %60 : i32
+ %63 = arith.cmpi sgt, %60, %arg15 : i32
+ %64 = arith.select %63, %arg15, %62 : i32
+ %65 = arith.trunci %64 : i32 to i8
+ %66 = arith.extsi %65 : i8 to i32
+ %67 = arith.subi %66, %arg8 : i32
+ %68 = "tosa.apply_scale"(%67, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
+ %69 = arith.cmpi slt, %68, %arg6 : i32
+ %70 = arith.select %69, %arg6, %68 : i32
+ %71 = arith.cmpi sgt, %68, %arg7 : i32
+ %72 = arith.select %71, %arg7, %70 : i32
+ %73 = arith.extsi %58 : i8 to i32
+ %74 = arith.subi %73, %arg18 : i32
+ %75 = "tosa.apply_scale"(%74, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
+ %76 = arith.cmpi slt, %75, %arg6 : i32
+ %77 = arith.select %76, %arg6, %75 : i32
+ %78 = arith.cmpi sgt, %75, %arg7 : i32
+ %79 = arith.select %78, %arg7, %77 : i32
+ %80 = arith.addi %79, %72 : i32
+ %81 = "tosa.apply_scale"(%80, %arg23, %arg24) {double_round = true} : (i32, i32, i8) -> i32
+ %82 = arith.addi %81, %arg25 : i32
+ %83 = arith.cmpi slt, %82, %arg3 : i32
+ %84 = arith.select %83, %arg3, %82 : i32
+ %85 = arith.cmpi sgt, %82, %arg15 : i32
+ %86 = arith.select %85, %arg15, %84 : i32
+ %87 = arith.trunci %86 : i32 to i8
+ linalg.yield %87 : i8
} -> tensor<384x128xi8>
- flow.dispatch.tensor.store %11, %arg30, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
+ flow.dispatch.tensor.store %11, %arg31, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
return
}
}
mobilebert-baseline-tf2-quant_noinlining_flow diffs 2
flow.executable private @main_dispatch_537 {
  flow.executable.export public @main_dispatch_537_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
    flow.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @main_dispatch_537_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i8, %arg11: i32, %arg12: i8, %arg13: i32, %arg14: i32, %arg15: i8, %arg16: i32, %arg17: i8, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i32, %arg25: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg28: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
      %0 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xi8>> -> tensor<384x512xi8>
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xi8>> -> tensor<512x128xi8>
      %2 = flow.dispatch.tensor.load %arg25, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
      %3 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
      %4 = flow.dispatch.tensor.load %arg27, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
      %5 = flow.dispatch.tensor.load %arg28, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
      %6 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
      %7 = tensor.empty() : tensor<384x128xi8>
      %8 = tensor.empty() : tensor<384x128xi32>
      %9 = linalg.fill ins(%arg0 : i32) outs(%8 : tensor<384x128xi32>) -> tensor<384x128xi32>
      %10 = linalg.matmul ins(%0, %1 : tensor<384x512xi8>, tensor<512x128xi8>) outs(%9 : tensor<384x128xi32>) -> tensor<384x128xi32>
      %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %10, %3, %4, %5, %6 : tensor<128xi32>, tensor<384x128xi32>, tensor<128xi32>, tensor<384x128xi8>, tensor<128xi8>, tensor<128xi8>) outs(%7 : tensor<384x128xi8>) {
      ^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i8, %in_3: i8, %in_4: i8, %out: i8):
        %12 = arith.extsi %in_3 : i8 to i32
        %13 = arith.subi %12, %arg3 : i32
        %14 = "tosa.apply_scale"(%13, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
        %15 = arith.cmpi slt, %14, %arg6 : i32
        %16 = arith.select %15, %arg6, %14 : i32
        %17 = arith.cmpi sgt, %14, %arg7 : i32
        %18 = arith.select %17, %arg7, %16 : i32
        %19 = arith.extsi %in_2 : i8 to i32
        %20 = "tosa.apply_scale"(%19, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
        %21 = arith.cmpi slt, %20, %arg6 : i32
        %22 = arith.select %21, %arg6, %20 : i32
        %23 = arith.cmpi sgt, %20, %arg7 : i32
        %24 = arith.select %23, %arg7, %22 : i32
        %25 = arith.extsi %in_4 : i8 to i32
        %26 = arith.subi %25, %arg8 : i32
        %27 = "tosa.apply_scale"(%26, %arg9, %arg10) {double_round = false} : (i32, i32, i8) -> i32
        %28 = arith.cmpi slt, %27, %arg6 : i32
        %29 = arith.select %28, %arg6, %27 : i32
        %30 = arith.cmpi sgt, %27, %arg7 : i32
        %31 = arith.select %30, %arg7, %29 : i32
        %32 = arith.muli %24, %18 : i32
        %33 = "tosa.apply_scale"(%32, %arg11, %arg12) {double_round = true} : (i32, i32, i8) -> i32
        %34 = arith.addi %33, %arg13 : i32
        %35 = arith.cmpi slt, %34, %arg3 : i32
        %36 = arith.select %35, %arg3, %34 : i32
        %37 = arith.cmpi sgt, %34, %arg14 : i32
        %38 = arith.select %37, %arg14, %36 : i32
        %39 = arith.trunci %38 : i32 to i8
        %40 = arith.extsi %39 : i8 to i32
        %41 = arith.subi %40, %arg13 : i32
        %42 = "tosa.apply_scale"(%41, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
        %43 = arith.cmpi slt, %42, %arg6 : i32
        %44 = arith.select %43, %arg6, %42 : i32
        %45 = arith.cmpi sgt, %42, %arg7 : i32
        %46 = arith.select %45, %arg7, %44 : i32
        %47 = arith.addi %46, %31 : i32
        %48 = arith.muli %in_1, %arg3 : i32
        %49 = arith.subi %in_0, %48 : i32
        %50 = arith.addi %in, %49 : i32
        %51 = "tosa.apply_scale"(%50, %arg16, %arg17) {double_round = true} : (i32, i32, i8) -> i32
        %52 = arith.addi %51, %arg18 : i32
        %53 = arith.cmpi slt, %52, %arg3 : i32
        %54 = arith.select %53, %arg3, %52 : i32
        %55 = arith.cmpi sgt, %52, %arg14 : i32
        %56 = arith.select %55, %arg14, %54 : i32
        %57 = arith.trunci %56 : i32 to i8
        %58 = "tosa.apply_scale"(%47, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
        %59 = arith.addi %58, %arg8 : i32
        %60 = arith.cmpi slt, %59, %arg3 : i32
        %61 = arith.select %60, %arg3, %59 : i32
        %62 = arith.cmpi sgt, %59, %arg14 : i32
        %63 = arith.select %62, %arg14, %61 : i32
        %64 = arith.trunci %63 : i32 to i8
        %65 = arith.extsi %64 : i8 to i32
        %66 = arith.subi %65, %arg8 : i32
        %67 = "tosa.apply_scale"(%66, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
        %68 = arith.cmpi slt, %67, %arg6 : i32
        %69 = arith.select %68, %arg6, %67 : i32
        %70 = arith.cmpi sgt, %67, %arg7 : i32
        %71 = arith.select %70, %arg7, %69 : i32
        %72 = arith.extsi %57 : i8 to i32
        %73 = arith.subi %72, %arg18 : i32
        %74 = "tosa.apply_scale"(%73, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
        %75 = arith.cmpi slt, %74, %arg6 : i32
        %76 = arith.select %75, %arg6, %74 : i32
        %77 = arith.cmpi sgt, %74, %arg7 : i32
        %78 = arith.select %77, %arg7, %76 : i32
        %79 = arith.addi %78, %71 : i32
        %80 = "tosa.apply_scale"(%79, %arg23, %arg20) {double_round = true} : (i32, i32, i8) -> i32
        %81 = arith.addi %80, %arg24 : i32
        %82 = arith.cmpi slt, %81, %arg3 : i32
        %83 = arith.select %82, %arg3, %81 : i32
        %84 = arith.cmpi sgt, %81, %arg14 : i32
        %85 = arith.select %84, %arg14, %83 : i32
        %86 = arith.trunci %85 : i32 to i8
        linalg.yield %86 : i8
      } -> tensor<384x128xi8>
      flow.dispatch.tensor.store %11, %arg30, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
      return
    }
  }
}
flow.executable private @main_dispatch_545 {
  flow.executable.export public @main_dispatch_545_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
    flow.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @main_dispatch_545_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i8, %arg12: i32, %arg13: i8, %arg14: i32, %arg15: i32, %arg16: i8, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i8, %arg25: i32, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg28: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg31: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
      %0 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xi8>> -> tensor<384x512xi8>
      %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xi8>> -> tensor<512x128xi8>
      %2 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
      %3 = flow.dispatch.tensor.load %arg27, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
      %4 = flow.dispatch.tensor.load %arg28, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
      %5 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
      %6 = flow.dispatch.tensor.load %arg30, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
      %7 = tensor.empty() : tensor<384x128xi8>
      %8 = tensor.empty() : tensor<384x128xi32>
      %9 = linalg.fill ins(%arg0 : i32) outs(%8 : tensor<384x128xi32>) -> tensor<384x128xi32>
      %10 = linalg.matmul ins(%0, %1 : tensor<384x512xi8>, tensor<512x128xi8>) outs(%9 : tensor<384x128xi32>) -> tensor<384x128xi32>
      %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %10, %3, %4, %5, %6 : tensor<128xi32>, tensor<384x128xi32>, tensor<128xi32>, tensor<384x128xi8>, tensor<128xi8>, tensor<128xi8>) outs(%7 : tensor<384x128xi8>) {
      ^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i8, %in_3: i8, %in_4: i8, %out: i8):
        %12 = arith.extsi %in_3 : i8 to i32
        %13 = arith.subi %12, %arg3 : i32
        %14 = "tosa.apply_scale"(%13, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
        %15 = arith.cmpi slt, %14, %arg6 : i32
        %16 = arith.select %15, %arg6, %14 : i32
        %17 = arith.cmpi sgt, %14, %arg7 : i32
        %18 = arith.select %17, %arg7, %16 : i32
        %19 = arith.extsi %in_2 : i8 to i32
        %20 = arith.subi %19, %arg8 : i32
        %21 = "tosa.apply_scale"(%20, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
        %22 = arith.cmpi slt, %21, %arg6 : i32
        %23 = arith.select %22, %arg6, %21 : i32
        %24 = arith.cmpi sgt, %21, %arg7 : i32
        %25 = arith.select %24, %arg7, %23 : i32
        %26 = arith.extsi %in_4 : i8 to i32
        %27 = arith.subi %26, %arg9 : i32
        %28 = "tosa.apply_scale"(%27, %arg10, %arg11) {double_round = false} : (i32, i32, i8) -> i32
        %29 = arith.cmpi slt, %28, %arg6 : i32
        %30 = arith.select %29, %arg6, %28 : i32
        %31 = arith.cmpi sgt, %28, %arg7 : i32
        %32 = arith.select %31, %arg7, %30 : i32
        %33 = arith.muli %25, %18 : i32
        %34 = "tosa.apply_scale"(%33, %arg12, %arg13) {double_round = true} : (i32, i32, i8) -> i32
        %35 = arith.addi %34, %arg14 : i32
        %36 = arith.cmpi slt, %35, %arg3 : i32
        %37 = arith.select %36, %arg3, %35 : i32
        %38 = arith.cmpi sgt, %35, %arg15 : i32
        %39 = arith.select %38, %arg15, %37 : i32
        %40 = arith.trunci %39 : i32 to i8
        %41 = arith.extsi %40 : i8 to i32
        %42 = arith.subi %41, %arg14 : i32
        %43 = "tosa.apply_scale"(%42, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
        %44 = arith.cmpi slt, %43, %arg6 : i32
        %45 = arith.select %44, %arg6, %43 : i32
        %46 = arith.cmpi sgt, %43, %arg7 : i32
        %47 = arith.select %46, %arg7, %45 : i32
        %48 = arith.addi %47, %32 : i32
        %49 = arith.muli %in_1, %arg3 : i32
        %50 = arith.subi %in_0, %49 : i32
        %51 = arith.addi %in, %50 : i32
        %52 = "tosa.apply_scale"(%51, %arg17, %arg13) {double_round = true} : (i32, i32, i8) -> i32
        %53 = arith.addi %52, %arg18 : i32
        %54 = arith.cmpi slt, %53, %arg3 : i32
        %55 = arith.select %54, %arg3, %53 : i32
        %56 = arith.cmpi sgt, %53, %arg15 : i32
        %57 = arith.select %56, %arg15, %55 : i32
        %58 = arith.trunci %57 : i32 to i8
        %59 = "tosa.apply_scale"(%48, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
        %60 = arith.addi %59, %arg8 : i32
        %61 = arith.cmpi slt, %60, %arg3 : i32
        %62 = arith.select %61, %arg3, %60 : i32
        %63 = arith.cmpi sgt, %60, %arg15 : i32
        %64 = arith.select %63, %arg15, %62 : i32
        %65 = arith.trunci %64 : i32 to i8
        %66 = arith.extsi %65 : i8 to i32
        %67 = arith.subi %66, %arg8 : i32
        %68 = "tosa.apply_scale"(%67, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
        %69 = arith.cmpi slt, %68, %arg6 : i32
        %70 = arith.select %69, %arg6, %68 : i32
        %71 = arith.cmpi sgt, %68, %arg7 : i32
        %72 = arith.select %71, %arg7, %70 : i32
        %73 = arith.extsi %58 : i8 to i32
        %74 = arith.subi %73, %arg18 : i32
        %75 = "tosa.apply_scale"(%74, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
        %76 = arith.cmpi slt, %75, %arg6 : i32
        %77 = arith.select %76, %arg6, %75 : i32
        %78 = arith.cmpi sgt, %75, %arg7 : i32
        %79 = arith.select %78, %arg7, %77 : i32
        %80 = arith.addi %79, %72 : i32
        %81 = "tosa.apply_scale"(%80, %arg23, %arg24) {double_round = true} : (i32, i32, i8) -> i32
        %82 = arith.addi %81, %arg25 : i32
        %83 = arith.cmpi slt, %82, %arg3 : i32
        %84 = arith.select %83, %arg3, %82 : i32
        %85 = arith.cmpi sgt, %82, %arg15 : i32
        %86 = arith.select %85, %arg15, %84 : i32
        %87 = arith.trunci %86 : i32 to i8
        linalg.yield %87 : i8
      } -> tensor<384x128xi8>
      flow.dispatch.tensor.store %11, %arg31, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
      return
    }
  }
}