Skip to content

Instantly share code, notes, and snippets.

@ScottTodd
Created February 17, 2023 21:01
Show Gist options
  • Save ScottTodd/a64c217dd9aa5b45da3028b441f70f88 to your computer and use it in GitHub Desktop.
Save ScottTodd/a64c217dd9aa5b45da3028b441f70f88 to your computer and use it in GitHub Desktop.
mobilebert-baseline-tf2-quant_noinlining_flow diffs 2
diff --git a/dispatch_537.mlir b/dispatch_545.mlir
index 43efe8e..244c54c 100644
--- a/dispatch_537.mlir
+++ b/dispatch_545.mlir
@@ -1,17 +1,17 @@
-  flow.executable private @main_dispatch_537 {
-    flow.executable.export public @main_dispatch_537_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
+  flow.executable private @main_dispatch_545 {
+    flow.executable.export public @main_dispatch_545_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
       %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
       flow.return %x, %y, %z : index, index, index
     }
     builtin.module {
-      func.func @main_dispatch_537_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i8, %arg11: i32, %arg12: i8, %arg13: i32, %arg14: i32, %arg15: i8, %arg16: i32, %arg17: i8, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i32, %arg25: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg28: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
+      func.func @main_dispatch_545_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i8, %arg12: i32, %arg13: i8, %arg14: i32, %arg15: i32, %arg16: i8, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i8, %arg25: i32, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg28: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg31: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
         %0 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xi8>> -> tensor<384x512xi8>
         %1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xi8>> -> tensor<512x128xi8>
-        %2 = flow.dispatch.tensor.load %arg25, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
-        %3 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
-        %4 = flow.dispatch.tensor.load %arg27, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
-        %5 = flow.dispatch.tensor.load %arg28, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
-        %6 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
+        %2 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
+        %3 = flow.dispatch.tensor.load %arg27, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
+        %4 = flow.dispatch.tensor.load %arg28, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
+        %5 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
+        %6 = flow.dispatch.tensor.load %arg30, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
         %7 = tensor.empty() : tensor<384x128xi8>
         %8 = tensor.empty() : tensor<384x128xi32>
         %9 = linalg.fill ins(%arg0 : i32) outs(%8 : tensor<384x128xi32>) -> tensor<384x128xi32>
@@ -26,76 +26,77 @@
           %17 = arith.cmpi sgt, %14, %arg7 : i32
           %18 = arith.select %17, %arg7, %16 : i32
           %19 = arith.extsi %in_2 : i8 to i32
-          %20 = "tosa.apply_scale"(%19, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
-          %21 = arith.cmpi slt, %20, %arg6 : i32
-          %22 = arith.select %21, %arg6, %20 : i32
-          %23 = arith.cmpi sgt, %20, %arg7 : i32
-          %24 = arith.select %23, %arg7, %22 : i32
-          %25 = arith.extsi %in_4 : i8 to i32
-          %26 = arith.subi %25, %arg8 : i32
-          %27 = "tosa.apply_scale"(%26, %arg9, %arg10) {double_round = false} : (i32, i32, i8) -> i32
-          %28 = arith.cmpi slt, %27, %arg6 : i32
-          %29 = arith.select %28, %arg6, %27 : i32
-          %30 = arith.cmpi sgt, %27, %arg7 : i32
-          %31 = arith.select %30, %arg7, %29 : i32
-          %32 = arith.muli %24, %18 : i32
-          %33 = "tosa.apply_scale"(%32, %arg11, %arg12) {double_round = true} : (i32, i32, i8) -> i32
-          %34 = arith.addi %33, %arg13 : i32
-          %35 = arith.cmpi slt, %34, %arg3 : i32
-          %36 = arith.select %35, %arg3, %34 : i32
-          %37 = arith.cmpi sgt, %34, %arg14 : i32
-          %38 = arith.select %37, %arg14, %36 : i32
-          %39 = arith.trunci %38 : i32 to i8
-          %40 = arith.extsi %39 : i8 to i32
-          %41 = arith.subi %40, %arg13 : i32
-          %42 = "tosa.apply_scale"(%41, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
-          %43 = arith.cmpi slt, %42, %arg6 : i32
-          %44 = arith.select %43, %arg6, %42 : i32
-          %45 = arith.cmpi sgt, %42, %arg7 : i32
-          %46 = arith.select %45, %arg7, %44 : i32
-          %47 = arith.addi %46, %31 : i32
-          %48 = arith.muli %in_1, %arg3 : i32
-          %49 = arith.subi %in_0, %48 : i32
-          %50 = arith.addi %in, %49 : i32
-          %51 = "tosa.apply_scale"(%50, %arg16, %arg17) {double_round = true} : (i32, i32, i8) -> i32
-          %52 = arith.addi %51, %arg18 : i32
-          %53 = arith.cmpi slt, %52, %arg3 : i32
-          %54 = arith.select %53, %arg3, %52 : i32
-          %55 = arith.cmpi sgt, %52, %arg14 : i32
-          %56 = arith.select %55, %arg14, %54 : i32
-          %57 = arith.trunci %56 : i32 to i8
-          %58 = "tosa.apply_scale"(%47, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
-          %59 = arith.addi %58, %arg8 : i32
-          %60 = arith.cmpi slt, %59, %arg3 : i32
-          %61 = arith.select %60, %arg3, %59 : i32
-          %62 = arith.cmpi sgt, %59, %arg14 : i32
-          %63 = arith.select %62, %arg14, %61 : i32
-          %64 = arith.trunci %63 : i32 to i8
-          %65 = arith.extsi %64 : i8 to i32
-          %66 = arith.subi %65, %arg8 : i32
-          %67 = "tosa.apply_scale"(%66, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
-          %68 = arith.cmpi slt, %67, %arg6 : i32
-          %69 = arith.select %68, %arg6, %67 : i32
-          %70 = arith.cmpi sgt, %67, %arg7 : i32
-          %71 = arith.select %70, %arg7, %69 : i32
-          %72 = arith.extsi %57 : i8 to i32
-          %73 = arith.subi %72, %arg18 : i32
-          %74 = "tosa.apply_scale"(%73, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
-          %75 = arith.cmpi slt, %74, %arg6 : i32
-          %76 = arith.select %75, %arg6, %74 : i32
-          %77 = arith.cmpi sgt, %74, %arg7 : i32
-          %78 = arith.select %77, %arg7, %76 : i32
-          %79 = arith.addi %78, %71 : i32
-          %80 = "tosa.apply_scale"(%79, %arg23, %arg20) {double_round = true} : (i32, i32, i8) -> i32
-          %81 = arith.addi %80, %arg24 : i32
-          %82 = arith.cmpi slt, %81, %arg3 : i32
-          %83 = arith.select %82, %arg3, %81 : i32
-          %84 = arith.cmpi sgt, %81, %arg14 : i32
-          %85 = arith.select %84, %arg14, %83 : i32
-          %86 = arith.trunci %85 : i32 to i8
-          linalg.yield %86 : i8
+          %20 = arith.subi %19, %arg8 : i32
+          %21 = "tosa.apply_scale"(%20, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
+          %22 = arith.cmpi slt, %21, %arg6 : i32
+          %23 = arith.select %22, %arg6, %21 : i32
+          %24 = arith.cmpi sgt, %21, %arg7 : i32
+          %25 = arith.select %24, %arg7, %23 : i32
+          %26 = arith.extsi %in_4 : i8 to i32
+          %27 = arith.subi %26, %arg9 : i32
+          %28 = "tosa.apply_scale"(%27, %arg10, %arg11) {double_round = false} : (i32, i32, i8) -> i32
+          %29 = arith.cmpi slt, %28, %arg6 : i32
+          %30 = arith.select %29, %arg6, %28 : i32
+          %31 = arith.cmpi sgt, %28, %arg7 : i32
+          %32 = arith.select %31, %arg7, %30 : i32
+          %33 = arith.muli %25, %18 : i32
+          %34 = "tosa.apply_scale"(%33, %arg12, %arg13) {double_round = true} : (i32, i32, i8) -> i32
+          %35 = arith.addi %34, %arg14 : i32
+          %36 = arith.cmpi slt, %35, %arg3 : i32
+          %37 = arith.select %36, %arg3, %35 : i32
+          %38 = arith.cmpi sgt, %35, %arg15 : i32
+          %39 = arith.select %38, %arg15, %37 : i32
+          %40 = arith.trunci %39 : i32 to i8
+          %41 = arith.extsi %40 : i8 to i32
+          %42 = arith.subi %41, %arg14 : i32
+          %43 = "tosa.apply_scale"(%42, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
+          %44 = arith.cmpi slt, %43, %arg6 : i32
+          %45 = arith.select %44, %arg6, %43 : i32
+          %46 = arith.cmpi sgt, %43, %arg7 : i32
+          %47 = arith.select %46, %arg7, %45 : i32
+          %48 = arith.addi %47, %32 : i32
+          %49 = arith.muli %in_1, %arg3 : i32
+          %50 = arith.subi %in_0, %49 : i32
+          %51 = arith.addi %in, %50 : i32
+          %52 = "tosa.apply_scale"(%51, %arg17, %arg13) {double_round = true} : (i32, i32, i8) -> i32
+          %53 = arith.addi %52, %arg18 : i32
+          %54 = arith.cmpi slt, %53, %arg3 : i32
+          %55 = arith.select %54, %arg3, %53 : i32
+          %56 = arith.cmpi sgt, %53, %arg15 : i32
+          %57 = arith.select %56, %arg15, %55 : i32
+          %58 = arith.trunci %57 : i32 to i8
+          %59 = "tosa.apply_scale"(%48, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
+          %60 = arith.addi %59, %arg8 : i32
+          %61 = arith.cmpi slt, %60, %arg3 : i32
+          %62 = arith.select %61, %arg3, %60 : i32
+          %63 = arith.cmpi sgt, %60, %arg15 : i32
+          %64 = arith.select %63, %arg15, %62 : i32
+          %65 = arith.trunci %64 : i32 to i8
+          %66 = arith.extsi %65 : i8 to i32
+          %67 = arith.subi %66, %arg8 : i32
+          %68 = "tosa.apply_scale"(%67, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
+          %69 = arith.cmpi slt, %68, %arg6 : i32
+          %70 = arith.select %69, %arg6, %68 : i32
+          %71 = arith.cmpi sgt, %68, %arg7 : i32
+          %72 = arith.select %71, %arg7, %70 : i32
+          %73 = arith.extsi %58 : i8 to i32
+          %74 = arith.subi %73, %arg18 : i32
+          %75 = "tosa.apply_scale"(%74, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
+          %76 = arith.cmpi slt, %75, %arg6 : i32
+          %77 = arith.select %76, %arg6, %75 : i32
+          %78 = arith.cmpi sgt, %75, %arg7 : i32
+          %79 = arith.select %78, %arg7, %77 : i32
+          %80 = arith.addi %79, %72 : i32
+          %81 = "tosa.apply_scale"(%80, %arg23, %arg24) {double_round = true} : (i32, i32, i8) -> i32
+          %82 = arith.addi %81, %arg25 : i32
+          %83 = arith.cmpi slt, %82, %arg3 : i32
+          %84 = arith.select %83, %arg3, %82 : i32
+          %85 = arith.cmpi sgt, %82, %arg15 : i32
+          %86 = arith.select %85, %arg15, %84 : i32
+          %87 = arith.trunci %86 : i32 to i8
+          linalg.yield %87 : i8
         } -> tensor<384x128xi8>
-        flow.dispatch.tensor.store %11, %arg30, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
+        flow.dispatch.tensor.store %11, %arg31, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
         return
       }
     }
// Dispatch executable 537: an i8 x i8 -> i32 matmul (384x512 times 512x128)
// followed by one fused elementwise integer epilogue that yields a 384x128 i8
// tensor.
flow.executable private @main_dispatch_537 {
// Exported entry point. The workgroup count (x, y, z) is derived from the two
// index operands by flow.dispatch.workgroup_count_from_dag_root.
flow.executable.export public @main_dispatch_537_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Operands as used below:
//   %arg0         i32 value the matmul accumulator is filled with.
//   %arg1, %arg2  read-only i8 matmul inputs (384x512 and 512x128).
//   %arg3..%arg24 i32/i8 scalars consumed by the epilogue as subtrahends,
//                 addends, tosa.apply_scale operands and clamp bounds.
//                 NOTE(review): presumably quantization zero points,
//                 multipliers and shifts -- confirm against the producer.
//   %arg25..%arg29 read-only tensors fed to the epilogue.
//   %arg30        write-only 384x128 i8 result.
func.func @main_dispatch_537_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i8, %arg11: i32, %arg12: i8, %arg13: i32, %arg14: i32, %arg15: i8, %arg16: i32, %arg17: i8, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i32, %arg25: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg28: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
// Load every input in full (zero offsets, unit strides).
%0 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xi8>> -> tensor<384x512xi8>
%1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xi8>> -> tensor<512x128xi8>
%2 = flow.dispatch.tensor.load %arg25, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
%3 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
%4 = flow.dispatch.tensor.load %arg27, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
%5 = flow.dispatch.tensor.load %arg28, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
%6 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
// %7 is the i8 output buffer of the epilogue; %8 is the i32 accumulator,
// filled with %arg0 before the matmul accumulates into it.
%7 = tensor.empty() : tensor<384x128xi8>
%8 = tensor.empty() : tensor<384x128xi32>
%9 = linalg.fill ins(%arg0 : i32) outs(%8 : tensor<384x128xi32>) -> tensor<384x128xi32>
%10 = linalg.matmul ins(%0, %1 : tensor<384x512xi8>, tensor<512x128xi8>) outs(%9 : tensor<384x128xi32>) -> tensor<384x128xi32>
// Elementwise epilogue over (d0, d1) = (384, 128), both dimensions parallel.
// The 1-D operands %2, %3, %5, %6 are indexed by d1 only, so each is reused
// for every d0. Region arguments map to inputs in order:
//   %in = %2, %in_0 = %10 (matmul result), %in_1 = %3, %in_2 = %4,
//   %in_3 = %5, %in_4 = %6.
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %10, %3, %4, %5, %6 : tensor<128xi32>, tensor<384x128xi32>, tensor<128xi32>, tensor<384x128xi8>, tensor<128xi8>, tensor<128xi8>) outs(%7 : tensor<384x128xi8>) {
^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i8, %in_3: i8, %in_4: i8, %out: i8):
// Recurring pattern below: sign-extend to i32, optionally subtract a scalar,
// tosa.apply_scale, then a cmpi/select pair that raises the value to a lower
// bound and a second pair that caps it at an upper bound.
//
// %18: %in_3 minus %arg3, scaled by (%arg4, %arg5), bounded by %arg6/%arg7.
%12 = arith.extsi %in_3 : i8 to i32
%13 = arith.subi %12, %arg3 : i32
%14 = "tosa.apply_scale"(%13, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
%15 = arith.cmpi slt, %14, %arg6 : i32
%16 = arith.select %15, %arg6, %14 : i32
%17 = arith.cmpi sgt, %14, %arg7 : i32
%18 = arith.select %17, %arg7, %16 : i32
// %24: %in_2 scaled by (%arg4, %arg5) with no subtraction beforehand,
// bounded by %arg6/%arg7.
%19 = arith.extsi %in_2 : i8 to i32
%20 = "tosa.apply_scale"(%19, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
%21 = arith.cmpi slt, %20, %arg6 : i32
%22 = arith.select %21, %arg6, %20 : i32
%23 = arith.cmpi sgt, %20, %arg7 : i32
%24 = arith.select %23, %arg7, %22 : i32
// %31: %in_4 minus %arg8, scaled by (%arg9, %arg10), bounded by %arg6/%arg7.
%25 = arith.extsi %in_4 : i8 to i32
%26 = arith.subi %25, %arg8 : i32
%27 = "tosa.apply_scale"(%26, %arg9, %arg10) {double_round = false} : (i32, i32, i8) -> i32
%28 = arith.cmpi slt, %27, %arg6 : i32
%29 = arith.select %28, %arg6, %27 : i32
%30 = arith.cmpi sgt, %27, %arg7 : i32
%31 = arith.select %30, %arg7, %29 : i32
// %39: product %24 * %18, scaled by (%arg11, %arg12), offset by %arg13,
// bounded by %arg3/%arg14 and truncated to i8.
%32 = arith.muli %24, %18 : i32
%33 = "tosa.apply_scale"(%32, %arg11, %arg12) {double_round = true} : (i32, i32, i8) -> i32
%34 = arith.addi %33, %arg13 : i32
%35 = arith.cmpi slt, %34, %arg3 : i32
%36 = arith.select %35, %arg3, %34 : i32
%37 = arith.cmpi sgt, %34, %arg14 : i32
%38 = arith.select %37, %arg14, %36 : i32
%39 = arith.trunci %38 : i32 to i8
// %46: the i8 product widened again, %arg13 removed, scaled by
// (%arg4, %arg15), bounded by %arg6/%arg7.
%40 = arith.extsi %39 : i8 to i32
%41 = arith.subi %40, %arg13 : i32
%42 = "tosa.apply_scale"(%41, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
%43 = arith.cmpi slt, %42, %arg6 : i32
%44 = arith.select %43, %arg6, %42 : i32
%45 = arith.cmpi sgt, %42, %arg7 : i32
%46 = arith.select %45, %arg7, %44 : i32
// %47: sum of the rescaled product (%46) and the rescaled %in_4 term (%31).
%47 = arith.addi %46, %31 : i32
// %50 = %in + (%in_0 - %in_1 * %arg3): the matmul result combined with the
// two i32 vectors.
%48 = arith.muli %in_1, %arg3 : i32
%49 = arith.subi %in_0, %48 : i32
%50 = arith.addi %in, %49 : i32
// %57: %50 scaled by (%arg16, %arg17), offset by %arg18, bounded by
// %arg3/%arg14 and truncated to i8.
%51 = "tosa.apply_scale"(%50, %arg16, %arg17) {double_round = true} : (i32, i32, i8) -> i32
%52 = arith.addi %51, %arg18 : i32
%53 = arith.cmpi slt, %52, %arg3 : i32
%54 = arith.select %53, %arg3, %52 : i32
%55 = arith.cmpi sgt, %52, %arg14 : i32
%56 = arith.select %55, %arg14, %54 : i32
%57 = arith.trunci %56 : i32 to i8
// %64: %47 scaled by (%arg19, %arg20), offset by %arg8, bounded by
// %arg3/%arg14 and truncated to i8.
%58 = "tosa.apply_scale"(%47, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
%59 = arith.addi %58, %arg8 : i32
%60 = arith.cmpi slt, %59, %arg3 : i32
%61 = arith.select %60, %arg3, %59 : i32
%62 = arith.cmpi sgt, %59, %arg14 : i32
%63 = arith.select %62, %arg14, %61 : i32
%64 = arith.trunci %63 : i32 to i8
// %71: %64 widened, %arg8 removed, scaled by (%arg4, %arg15), bounded by
// %arg6/%arg7.
%65 = arith.extsi %64 : i8 to i32
%66 = arith.subi %65, %arg8 : i32
%67 = "tosa.apply_scale"(%66, %arg4, %arg15) {double_round = false} : (i32, i32, i8) -> i32
%68 = arith.cmpi slt, %67, %arg6 : i32
%69 = arith.select %68, %arg6, %67 : i32
%70 = arith.cmpi sgt, %67, %arg7 : i32
%71 = arith.select %70, %arg7, %69 : i32
// %78: the i8 matmul term (%57) widened, %arg18 removed, scaled by
// (%arg21, %arg22), bounded by %arg6/%arg7.
%72 = arith.extsi %57 : i8 to i32
%73 = arith.subi %72, %arg18 : i32
%74 = "tosa.apply_scale"(%73, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
%75 = arith.cmpi slt, %74, %arg6 : i32
%76 = arith.select %75, %arg6, %74 : i32
%77 = arith.cmpi sgt, %74, %arg7 : i32
%78 = arith.select %77, %arg7, %76 : i32
// Final value: %78 + %71, scaled by (%arg23, %arg20), offset by %arg24,
// bounded by %arg3/%arg14, truncated to i8 and yielded.
%79 = arith.addi %78, %71 : i32
%80 = "tosa.apply_scale"(%79, %arg23, %arg20) {double_round = true} : (i32, i32, i8) -> i32
%81 = arith.addi %80, %arg24 : i32
%82 = arith.cmpi slt, %81, %arg3 : i32
%83 = arith.select %82, %arg3, %81 : i32
%84 = arith.cmpi sgt, %81, %arg14 : i32
%85 = arith.select %84, %arg14, %83 : i32
%86 = arith.trunci %85 : i32 to i8
linalg.yield %86 : i8
} -> tensor<384x128xi8>
// Write the whole 384x128 i8 result to the output binding.
flow.dispatch.tensor.store %11, %arg30, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
return
}
}
}
// Dispatch executable 545: an i8 x i8 -> i32 matmul (384x512 times 512x128)
// followed by one fused elementwise integer epilogue that yields a 384x128 i8
// tensor. This variant takes 25 scalar operands after %arg0 (%arg3..%arg25)
// and subtracts %arg8 from %in_2 before its first rescale.
flow.executable private @main_dispatch_545 {
// Exported entry point. The workgroup count (x, y, z) is derived from the two
// index operands by flow.dispatch.workgroup_count_from_dag_root.
flow.executable.export public @main_dispatch_545_matmul_384x128x512 workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0, %arg1
flow.return %x, %y, %z : index, index, index
}
builtin.module {
// Operands as used below:
//   %arg0         i32 value the matmul accumulator is filled with.
//   %arg1, %arg2  read-only i8 matmul inputs (384x512 and 512x128).
//   %arg3..%arg25 i32/i8 scalars consumed by the epilogue as subtrahends,
//                 addends, tosa.apply_scale operands and clamp bounds.
//                 NOTE(review): presumably quantization zero points,
//                 multipliers and shifts -- confirm against the producer.
//   %arg26..%arg30 read-only tensors fed to the epilogue.
//   %arg31        write-only 384x128 i8 result.
func.func @main_dispatch_545_matmul_384x128x512(%arg0: i32, %arg1: !flow.dispatch.tensor<readonly:tensor<384x512xi8>>, %arg2: !flow.dispatch.tensor<readonly:tensor<512x128xi8>>, %arg3: i32, %arg4: i32, %arg5: i8, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i8, %arg12: i32, %arg13: i8, %arg14: i32, %arg15: i32, %arg16: i8, %arg17: i32, %arg18: i32, %arg19: i32, %arg20: i8, %arg21: i32, %arg22: i8, %arg23: i32, %arg24: i8, %arg25: i32, %arg26: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg27: !flow.dispatch.tensor<readonly:tensor<128xi32>>, %arg28: !flow.dispatch.tensor<readonly:tensor<384x128xi8>>, %arg29: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg30: !flow.dispatch.tensor<readonly:tensor<128xi8>>, %arg31: !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>) {
// Load every input in full (zero offsets, unit strides).
%0 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xi8>> -> tensor<384x512xi8>
%1 = flow.dispatch.tensor.load %arg2, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xi8>> -> tensor<512x128xi8>
%2 = flow.dispatch.tensor.load %arg26, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
%3 = flow.dispatch.tensor.load %arg27, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi32>> -> tensor<128xi32>
%4 = flow.dispatch.tensor.load %arg28, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x128xi8>> -> tensor<384x128xi8>
%5 = flow.dispatch.tensor.load %arg29, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
%6 = flow.dispatch.tensor.load %arg30, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xi8>> -> tensor<128xi8>
// %7 is the i8 output buffer of the epilogue; %8 is the i32 accumulator,
// filled with %arg0 before the matmul accumulates into it.
%7 = tensor.empty() : tensor<384x128xi8>
%8 = tensor.empty() : tensor<384x128xi32>
%9 = linalg.fill ins(%arg0 : i32) outs(%8 : tensor<384x128xi32>) -> tensor<384x128xi32>
%10 = linalg.matmul ins(%0, %1 : tensor<384x512xi8>, tensor<512x128xi8>) outs(%9 : tensor<384x128xi32>) -> tensor<384x128xi32>
// Elementwise epilogue over (d0, d1) = (384, 128), both dimensions parallel.
// The 1-D operands %2, %3, %5, %6 are indexed by d1 only, so each is reused
// for every d0. Region arguments map to inputs in order:
//   %in = %2, %in_0 = %10 (matmul result), %in_1 = %3, %in_2 = %4,
//   %in_3 = %5, %in_4 = %6.
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2, %10, %3, %4, %5, %6 : tensor<128xi32>, tensor<384x128xi32>, tensor<128xi32>, tensor<384x128xi8>, tensor<128xi8>, tensor<128xi8>) outs(%7 : tensor<384x128xi8>) {
^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i8, %in_3: i8, %in_4: i8, %out: i8):
// Recurring pattern below: sign-extend to i32, subtract a scalar,
// tosa.apply_scale, then a cmpi/select pair that raises the value to a lower
// bound and a second pair that caps it at an upper bound.
//
// %18: %in_3 minus %arg3, scaled by (%arg4, %arg5), bounded by %arg6/%arg7.
%12 = arith.extsi %in_3 : i8 to i32
%13 = arith.subi %12, %arg3 : i32
%14 = "tosa.apply_scale"(%13, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
%15 = arith.cmpi slt, %14, %arg6 : i32
%16 = arith.select %15, %arg6, %14 : i32
%17 = arith.cmpi sgt, %14, %arg7 : i32
%18 = arith.select %17, %arg7, %16 : i32
// %25: %in_2 minus %arg8, scaled by (%arg4, %arg5), bounded by %arg6/%arg7.
%19 = arith.extsi %in_2 : i8 to i32
%20 = arith.subi %19, %arg8 : i32
%21 = "tosa.apply_scale"(%20, %arg4, %arg5) {double_round = false} : (i32, i32, i8) -> i32
%22 = arith.cmpi slt, %21, %arg6 : i32
%23 = arith.select %22, %arg6, %21 : i32
%24 = arith.cmpi sgt, %21, %arg7 : i32
%25 = arith.select %24, %arg7, %23 : i32
// %32: %in_4 minus %arg9, scaled by (%arg10, %arg11), bounded by %arg6/%arg7.
%26 = arith.extsi %in_4 : i8 to i32
%27 = arith.subi %26, %arg9 : i32
%28 = "tosa.apply_scale"(%27, %arg10, %arg11) {double_round = false} : (i32, i32, i8) -> i32
%29 = arith.cmpi slt, %28, %arg6 : i32
%30 = arith.select %29, %arg6, %28 : i32
%31 = arith.cmpi sgt, %28, %arg7 : i32
%32 = arith.select %31, %arg7, %30 : i32
// %40: product %25 * %18, scaled by (%arg12, %arg13), offset by %arg14,
// bounded by %arg3/%arg15 and truncated to i8.
%33 = arith.muli %25, %18 : i32
%34 = "tosa.apply_scale"(%33, %arg12, %arg13) {double_round = true} : (i32, i32, i8) -> i32
%35 = arith.addi %34, %arg14 : i32
%36 = arith.cmpi slt, %35, %arg3 : i32
%37 = arith.select %36, %arg3, %35 : i32
%38 = arith.cmpi sgt, %35, %arg15 : i32
%39 = arith.select %38, %arg15, %37 : i32
%40 = arith.trunci %39 : i32 to i8
// %47: the i8 product widened again, %arg14 removed, scaled by
// (%arg4, %arg16), bounded by %arg6/%arg7.
%41 = arith.extsi %40 : i8 to i32
%42 = arith.subi %41, %arg14 : i32
%43 = "tosa.apply_scale"(%42, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
%44 = arith.cmpi slt, %43, %arg6 : i32
%45 = arith.select %44, %arg6, %43 : i32
%46 = arith.cmpi sgt, %43, %arg7 : i32
%47 = arith.select %46, %arg7, %45 : i32
// %48: sum of the rescaled product (%47) and the rescaled %in_4 term (%32).
%48 = arith.addi %47, %32 : i32
// %51 = %in + (%in_0 - %in_1 * %arg3): the matmul result combined with the
// two i32 vectors.
%49 = arith.muli %in_1, %arg3 : i32
%50 = arith.subi %in_0, %49 : i32
%51 = arith.addi %in, %50 : i32
// %58: %51 scaled by (%arg17, %arg13), offset by %arg18, bounded by
// %arg3/%arg15 and truncated to i8.
%52 = "tosa.apply_scale"(%51, %arg17, %arg13) {double_round = true} : (i32, i32, i8) -> i32
%53 = arith.addi %52, %arg18 : i32
%54 = arith.cmpi slt, %53, %arg3 : i32
%55 = arith.select %54, %arg3, %53 : i32
%56 = arith.cmpi sgt, %53, %arg15 : i32
%57 = arith.select %56, %arg15, %55 : i32
%58 = arith.trunci %57 : i32 to i8
// %65: %48 scaled by (%arg19, %arg20), offset by %arg8, bounded by
// %arg3/%arg15 and truncated to i8.
%59 = "tosa.apply_scale"(%48, %arg19, %arg20) {double_round = true} : (i32, i32, i8) -> i32
%60 = arith.addi %59, %arg8 : i32
%61 = arith.cmpi slt, %60, %arg3 : i32
%62 = arith.select %61, %arg3, %60 : i32
%63 = arith.cmpi sgt, %60, %arg15 : i32
%64 = arith.select %63, %arg15, %62 : i32
%65 = arith.trunci %64 : i32 to i8
// %72: %65 widened, %arg8 removed, scaled by (%arg4, %arg16), bounded by
// %arg6/%arg7.
%66 = arith.extsi %65 : i8 to i32
%67 = arith.subi %66, %arg8 : i32
%68 = "tosa.apply_scale"(%67, %arg4, %arg16) {double_round = false} : (i32, i32, i8) -> i32
%69 = arith.cmpi slt, %68, %arg6 : i32
%70 = arith.select %69, %arg6, %68 : i32
%71 = arith.cmpi sgt, %68, %arg7 : i32
%72 = arith.select %71, %arg7, %70 : i32
// %79: the i8 matmul term (%58) widened, %arg18 removed, scaled by
// (%arg21, %arg22), bounded by %arg6/%arg7.
%73 = arith.extsi %58 : i8 to i32
%74 = arith.subi %73, %arg18 : i32
%75 = "tosa.apply_scale"(%74, %arg21, %arg22) {double_round = false} : (i32, i32, i8) -> i32
%76 = arith.cmpi slt, %75, %arg6 : i32
%77 = arith.select %76, %arg6, %75 : i32
%78 = arith.cmpi sgt, %75, %arg7 : i32
%79 = arith.select %78, %arg7, %77 : i32
// Final value: %79 + %72, scaled by (%arg23, %arg24), offset by %arg25,
// bounded by %arg3/%arg15, truncated to i8 and yielded.
%80 = arith.addi %79, %72 : i32
%81 = "tosa.apply_scale"(%80, %arg23, %arg24) {double_round = true} : (i32, i32, i8) -> i32
%82 = arith.addi %81, %arg25 : i32
%83 = arith.cmpi slt, %82, %arg3 : i32
%84 = arith.select %83, %arg3, %82 : i32
%85 = arith.cmpi sgt, %82, %arg15 : i32
%86 = arith.select %85, %arg15, %84 : i32
%87 = arith.trunci %86 : i32 to i8
linalg.yield %87 : i8
} -> tensor<384x128xi8>
// Write the whole 384x128 i8 result to the output binding.
flow.dispatch.tensor.store %11, %arg31, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xi8> -> !flow.dispatch.tensor<writeonly:tensor<384x128xi8>>
return
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment