Skip to content

Instantly share code, notes, and snippets.

@hanhanW

hanhanW/zz.mlir Secret

Created June 24, 2024 19:38
Show Gist options
  • Save hanhanW/959cf2809098c3485ee1ebd6394e5836 to your computer and use it in GitHub Desktop.
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1) -> (d0)>
#map5 = affine_map<(d0, d1) -> (d1)>
#map6 = affine_map<(d0, d1) -> ()>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
// Checks that a quantized matmul can be decomposed into a plain (zero-point-
// free) matmul plus correction terms, for the static 3x4x5 case.
//   %2  = reference: linalg.quantized_matmul(lhs, rhs, zp_lhs=%arg2, zp_rhs=%arg3)
//   %19 = decomposed: mm - zp_rhs*rowsum(lhs) - zp_lhs*colsum(rhs) + K*zp_lhs*zp_rhs
// with K = 4 (the reduction dimension). The two results are asserted equal.
util.func private @check_one_quantized_matmul_as_matmul_3x4x5(%arg0: tensor<3x4xi8>, %arg1: tensor<4x5xi8>, %arg2: i32, %arg3: i32) {
// K (reduction size) as an i32, used in the zp_lhs*zp_rhs*K correction term.
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
// Reference path: zero-initialized accumulator, then quantized_matmul.
%0 = tensor.empty() : tensor<3x5xi32>
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<3x5xi32>) -> tensor<3x5xi32>
%2 = linalg.quantized_matmul ins(%arg0, %arg1, %arg2, %arg3 : tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) outs(%1 : tensor<3x5xi32>) -> tensor<3x5xi32>
// Decomposed path: attach data-tiling encodings (LHS/RHS/RESULT roles,
// matmul_narrow_M = 3 since M=3 < tile size 16) and run a plain i8->i32 matmul.
%3 = iree_encoding.set_encoding %arg0 : tensor<3x4xi8> -> tensor<3x4xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<3x4xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%4 = iree_encoding.set_encoding %arg1 : tensor<4x5xi8> -> tensor<4x5xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<4x5xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%5 = tensor.empty() : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%7 = linalg.matmul ins(%3, %4 : tensor<3x4xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<3x4xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>, tensor<4x5xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<4x5xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) outs(%6 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
// %8 = raw matmul result with the encoding stripped.
%8 = iree_encoding.unset_encoding %7 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> -> tensor<3x5xi32>
// %13 = rowsum(lhs): sign-extend LHS to i32 (%10), then reduce over k (#map4).
%9 = tensor.empty() : tensor<3x4xi32>
%10 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<3x4xi8>) outs(%9 : tensor<3x4xi32>) {
^bb0(%in: i8, %out: i32):
%20 = arith.extsi %in : i8 to i32
linalg.yield %20 : i32
} -> tensor<3x4xi32>
%11 = tensor.empty() : tensor<3xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<3xi32>) -> tensor<3xi32>
%13 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<3x4xi32>) outs(%12 : tensor<3xi32>) {
^bb0(%in: i32, %out: i32):
%20 = arith.addi %in, %out : i32
linalg.yield %20 : i32
} -> tensor<3xi32>
// %18 = colsum(rhs): sign-extend RHS to i32 (%15), then reduce over k (#map5).
%14 = tensor.empty() : tensor<4x5xi32>
%15 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<4x5xi8>) outs(%14 : tensor<4x5xi32>) {
^bb0(%in: i8, %out: i32):
%20 = arith.extsi %in : i8 to i32
linalg.yield %20 : i32
} -> tensor<4x5xi32>
%16 = tensor.empty() : tensor<5xi32>
%17 = linalg.fill ins(%c0_i32 : i32) outs(%16 : tensor<5xi32>) -> tensor<5xi32>
%18 = linalg.generic {indexing_maps = [#map3, #map5], iterator_types = ["reduction", "parallel"]} ins(%15 : tensor<4x5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%in: i32, %out: i32):
%20 = arith.addi %in, %out : i32
linalg.yield %20 : i32
} -> tensor<5xi32>
// Apply the zero-point corrections elementwise. Block args:
//   %in   = mm[i,j]     %in_0 = rowsum_lhs[i]  %in_1 = colsum_rhs[j]
//   %in_2 = zp_lhs      %in_3 = zp_rhs         %in_4 = K (= 4)
%19 = linalg.generic {indexing_maps = [#map3, #map4, #map5, #map6, #map6, #map6, #map3], iterator_types = ["parallel", "parallel"]} ins(%8, %13, %18, %arg2, %arg3, %c4_i32 : tensor<3x5xi32>, tensor<3xi32>, tensor<5xi32>, i32, i32, i32) outs(%0 : tensor<3x5xi32>) {
^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i32, %in_3: i32, %in_4: i32, %out: i32):
%20 = arith.muli %in_0, %in_3 : i32
%21 = arith.muli %in_1, %in_2 : i32
%22 = arith.addi %20, %21 : i32
%23 = arith.muli %in_2, %in_3 : i32
%24 = arith.muli %in_4, %23 : i32
%25 = arith.subi %in, %22 : i32
%26 = arith.addi %25, %24 : i32
linalg.yield %26 : i32
} -> tensor<3x5xi32>
// Reference and decomposed results must match exactly.
check.expect_eq(%2, %19) : tensor<3x5xi32>
util.return
}
// Dynamic-shape variant of the quantized-matmul decomposition check.
// Takes tensor<?x?xi8> operands, ties their runtime shapes to 3x4 / 4x5 via
// flow.tensor.tie_shape, and compares linalg.quantized_matmul (zero points
// fixed to -128 / 127) against the mm + zero-point-correction decomposition.
// No matmul_narrow_M hint in the encodings here since M is dynamic.
util.func private @check_one_quantized_matmul_as_matmul_dynamic(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>) {
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
// NOTE: duplicate of %c4 (pre-canonicalization dump; CSE has not run yet).
%c4_0 = arith.constant 4 : index
%c5 = arith.constant 5 : index
// Zero points: zp_lhs = -128, zp_rhs = 127.
%c-128_i32 = arith.constant -128 : i32
%c127_i32 = arith.constant 127 : i32
%c0_i32 = arith.constant 0 : i32
// Bind the dynamic operands to their actual runtime shapes (3x4 and 4x5).
%0 = flow.tensor.tie_shape %arg0 : tensor<?x?xi8>{%c3, %c4}
%1 = flow.tensor.tie_shape %arg1 : tensor<?x?xi8>{%c4_0, %c5}
// Reference path.
%2 = tensor.empty(%c3, %c5) : tensor<?x?xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<?x?xi32>) -> tensor<?x?xi32>
%4 = linalg.quantized_matmul ins(%0, %1, %c-128_i32, %c127_i32 : tensor<?x?xi8>, tensor<?x?xi8>, i32, i32) outs(%3 : tensor<?x?xi32>) -> tensor<?x?xi32>
// %5 = K as i32 for the K*zp_lhs*zp_rhs correction term.
%5 = arith.index_cast %c4 : index to i32
// Decomposed path: encoded plain matmul on the raw i8 operands.
%6 = iree_encoding.set_encoding %0 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%7 = iree_encoding.set_encoding %1 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%8 = tensor.empty(%c3, %c5) : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%10 = linalg.matmul ins(%6, %7 : tensor<?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>, tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) outs(%9 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%11 = iree_encoding.unset_encoding %10 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> -> tensor<?x?xi32>
// Slice the MxN region out of the (possibly padded) unset-encoding result.
%extracted_slice = tensor.extract_slice %11[0, 0] [%c3, %c5] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
// rowsum(lhs): sign-extend then reduce over k.
%12 = tensor.empty(%c3, %c4) : tensor<?x?xi32>
%13 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<?x?xi8>) outs(%12 : tensor<?x?xi32>) {
^bb0(%in: i8, %out: i32):
%23 = arith.extsi %in : i8 to i32
linalg.yield %23 : i32
} -> tensor<?x?xi32>
%14 = tensor.empty(%c3) : tensor<?xi32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<?xi32>) -> tensor<?xi32>
%16 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "reduction"]} ins(%13 : tensor<?x?xi32>) outs(%15 : tensor<?xi32>) {
^bb0(%in: i32, %out: i32):
%23 = arith.addi %in, %out : i32
linalg.yield %23 : i32
} -> tensor<?xi32>
// colsum(rhs): sign-extend then reduce over k.
%17 = tensor.empty(%c4_0, %c5) : tensor<?x?xi32>
%18 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<?x?xi8>) outs(%17 : tensor<?x?xi32>) {
^bb0(%in: i8, %out: i32):
%23 = arith.extsi %in : i8 to i32
linalg.yield %23 : i32
} -> tensor<?x?xi32>
%19 = tensor.empty(%c5) : tensor<?xi32>
%20 = linalg.fill ins(%c0_i32 : i32) outs(%19 : tensor<?xi32>) -> tensor<?xi32>
%21 = linalg.generic {indexing_maps = [#map3, #map5], iterator_types = ["reduction", "parallel"]} ins(%18 : tensor<?x?xi32>) outs(%20 : tensor<?xi32>) {
^bb0(%in: i32, %out: i32):
%23 = arith.addi %in, %out : i32
linalg.yield %23 : i32
} -> tensor<?xi32>
// Zero-point corrections. Block args:
//   %in   = mm[i,j]     %in_1 = rowsum_lhs[i]  %in_2 = colsum_rhs[j]
//   %in_3 = zp_lhs      %in_4 = zp_rhs         %in_5 = K
// result = mm - (rowsum*zp_rhs + colsum*zp_lhs) + K*zp_lhs*zp_rhs
%22 = linalg.generic {indexing_maps = [#map3, #map4, #map5, #map6, #map6, #map6, #map3], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %16, %21, %c-128_i32, %c127_i32, %5 : tensor<?x?xi32>, tensor<?xi32>, tensor<?xi32>, i32, i32, i32) outs(%2 : tensor<?x?xi32>) {
^bb0(%in: i32, %in_1: i32, %in_2: i32, %in_3: i32, %in_4: i32, %in_5: i32, %out: i32):
%23 = arith.muli %in_1, %in_4 : i32
%24 = arith.muli %in_2, %in_3 : i32
%25 = arith.addi %23, %24 : i32
%26 = arith.muli %in_3, %in_4 : i32
%27 = arith.muli %in_5, %26 : i32
%28 = arith.subi %in, %25 : i32
%29 = arith.addi %28, %27 : i32
linalg.yield %29 : i32
} -> tensor<?x?xi32>
// Reference and decomposed results must match exactly.
check.expect_eq(%4, %22) : tensor<?x?xi32>
util.return
}
// Test driver: runs the static 3x4x5 check with several (zp_lhs, zp_rhs)
// combinations (including zero, small, and extreme i8 zero points) on two
// operand pairs, then runs the dynamic-shape check on casted operands.
// util.optimization_barrier keeps the constant inputs from being
// constant-folded through the checks.
util.func public @test_quantized_matmul_as_matmul() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @test_quantized_matmul_as_matmul() -> ()"}} {
// NOTE(review): %c5/%c4/%c3 are unused in this dump (pre-canonicalization).
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
// Operand pair B: values spanning the full i8 range.
%cst = arith.constant dense<[[123, -125, 127, -128, 91], [-70, 37, 0, -40, 57], [-128, 127, -121, -100, 99], [127, 105, 83, 51, -128]]> : tensor<4x5xi8>
%cst_0 = arith.constant dense<[[127, -128, 0, 51], [-47, 101, -119, 0], [-128, 89, -63, 127]]> : tensor<3x4xi8>
// Operand pair A: small values.
%cst_1 = arith.constant dense<[[5, 4, 3, 2, 9], [1, 0, -1, -2, 8], [-3, -4, -5, -6, 7], [2, 3, 5, 7, 11]]> : tensor<4x5xi8>
%c127_i32 = arith.constant 127 : i32
%c-128_i32 = arith.constant -128 : i32
%c-57_i32 = arith.constant -57 : i32
%c41_i32 = arith.constant 41 : i32
%c3_i32 = arith.constant 3 : i32
%c-2_i32 = arith.constant -2 : i32
%c0_i32 = arith.constant 0 : i32
%cst_2 = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]> : tensor<3x4xi8>
// Barriers so the backend actually computes the matmuls at runtime.
%0 = util.optimization_barrier %cst_2 : tensor<3x4xi8>
%1 = util.optimization_barrier %cst_1 : tensor<4x5xi8>
%2 = util.optimization_barrier %cst_0 : tensor<3x4xi8>
%3 = util.optimization_barrier %cst : tensor<4x5xi8>
// Pair A with zero / mixed-sign zero points.
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c0_i32, %c0_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c0_i32, %c3_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c-2_i32, %c0_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c-2_i32, %c3_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
// Pair B with arbitrary and extreme (i8 min/max) zero points.
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%2, %3, %c41_i32, %c-57_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%2, %3, %c-128_i32, %c127_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
// Dynamic-shape variant: same data, shapes erased via tensor.cast.
%cast = tensor.cast %0 : tensor<3x4xi8> to tensor<?x?xi8>
%cast_3 = tensor.cast %1 : tensor<4x5xi8> to tensor<?x?xi8>
util.call @check_one_quantized_matmul_as_matmul_dynamic(%cast, %cast_3) : (tensor<?x?xi8>, tensor<?x?xi8>) -> ()
util.return
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<(d0, d1) -> (d0, d1)>
#map4 = affine_map<(d0, d1) -> (d0)>
#map5 = affine_map<(d0, d1) -> (d1)>
#map6 = affine_map<(d0, d1) -> ()>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
// Post-canonicalization form of the static 3x4x5 decomposition check.
// Structure is unchanged from the IPO dump (the function was already in
// canonical form): %2 is the linalg.quantized_matmul reference, %19 is the
// decomposition mm - zp_rhs*rowsum(lhs) - zp_lhs*colsum(rhs) + K*zp_lhs*zp_rhs
// with K = 4, and the two are asserted equal.
util.func private @check_one_quantized_matmul_as_matmul_3x4x5(%arg0: tensor<3x4xi8>, %arg1: tensor<4x5xi8>, %arg2: i32, %arg3: i32) {
// K (reduction size) for the K*zp_lhs*zp_rhs correction term.
%c4_i32 = arith.constant 4 : i32
%c0_i32 = arith.constant 0 : i32
// Reference path.
%0 = tensor.empty() : tensor<3x5xi32>
%1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<3x5xi32>) -> tensor<3x5xi32>
%2 = linalg.quantized_matmul ins(%arg0, %arg1, %arg2, %arg3 : tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) outs(%1 : tensor<3x5xi32>) -> tensor<3x5xi32>
// Decomposed path: encoded plain i8->i32 matmul (matmul_narrow_M = 3).
%3 = iree_encoding.set_encoding %arg0 : tensor<3x4xi8> -> tensor<3x4xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<3x4xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%4 = iree_encoding.set_encoding %arg1 : tensor<4x5xi8> -> tensor<4x5xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<4x5xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%5 = tensor.empty() : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%7 = linalg.matmul ins(%3, %4 : tensor<3x4xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<3x4xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>, tensor<4x5xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<4x5xi8>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) outs(%6 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%8 = iree_encoding.unset_encoding %7 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<3x5xi32>, matmul_narrow_M = 3 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> -> tensor<3x5xi32>
// %13 = rowsum(lhs) after sign-extension to i32.
%9 = tensor.empty() : tensor<3x4xi32>
%10 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<3x4xi8>) outs(%9 : tensor<3x4xi32>) {
^bb0(%in: i8, %out: i32):
%20 = arith.extsi %in : i8 to i32
linalg.yield %20 : i32
} -> tensor<3x4xi32>
%11 = tensor.empty() : tensor<3xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<3xi32>) -> tensor<3xi32>
%13 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<3x4xi32>) outs(%12 : tensor<3xi32>) {
^bb0(%in: i32, %out: i32):
%20 = arith.addi %in, %out : i32
linalg.yield %20 : i32
} -> tensor<3xi32>
// %18 = colsum(rhs) after sign-extension to i32.
%14 = tensor.empty() : tensor<4x5xi32>
%15 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%arg1 : tensor<4x5xi8>) outs(%14 : tensor<4x5xi32>) {
^bb0(%in: i8, %out: i32):
%20 = arith.extsi %in : i8 to i32
linalg.yield %20 : i32
} -> tensor<4x5xi32>
%16 = tensor.empty() : tensor<5xi32>
%17 = linalg.fill ins(%c0_i32 : i32) outs(%16 : tensor<5xi32>) -> tensor<5xi32>
%18 = linalg.generic {indexing_maps = [#map3, #map5], iterator_types = ["reduction", "parallel"]} ins(%15 : tensor<4x5xi32>) outs(%17 : tensor<5xi32>) {
^bb0(%in: i32, %out: i32):
%20 = arith.addi %in, %out : i32
linalg.yield %20 : i32
} -> tensor<5xi32>
// Elementwise zero-point corrections; see the identity in the header comment.
%19 = linalg.generic {indexing_maps = [#map3, #map4, #map5, #map6, #map6, #map6, #map3], iterator_types = ["parallel", "parallel"]} ins(%8, %13, %18, %arg2, %arg3, %c4_i32 : tensor<3x5xi32>, tensor<3xi32>, tensor<5xi32>, i32, i32, i32) outs(%0 : tensor<3x5xi32>) {
^bb0(%in: i32, %in_0: i32, %in_1: i32, %in_2: i32, %in_3: i32, %in_4: i32, %out: i32):
%20 = arith.muli %in_0, %in_3 : i32
%21 = arith.muli %in_1, %in_2 : i32
%22 = arith.addi %20, %21 : i32
%23 = arith.muli %in_2, %in_3 : i32
%24 = arith.muli %in_4, %23 : i32
%25 = arith.subi %in, %22 : i32
%26 = arith.addi %25, %24 : i32
linalg.yield %26 : i32
} -> tensor<3x5xi32>
check.expect_eq(%2, %19) : tensor<3x5xi32>
util.return
}
// Post-canonicalization form of the dynamic decomposition check. The
// canonicalizer has propagated the tie_shape'd static sizes (3x4, 4x5):
// tensor.empty ops are now static, tensor.cast ops refine the dynamic
// operands/results to static shapes around the shaped ops, the duplicate
// index constant was CSE'd, and index_cast(%c4) folded to %c4_i32.
util.func private @check_one_quantized_matmul_as_matmul_dynamic(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>) {
// K = 4 (folded from the index_cast in the pre-canonicalization dump).
%c4_i32 = arith.constant 4 : i32
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
// Zero points: zp_lhs = -128, zp_rhs = 127.
%c-128_i32 = arith.constant -128 : i32
%c127_i32 = arith.constant 127 : i32
%c0_i32 = arith.constant 0 : i32
%0 = flow.tensor.tie_shape %arg0 : tensor<?x?xi8>{%c3, %c4}
%1 = flow.tensor.tie_shape %arg1 : tensor<?x?xi8>{%c4, %c5}
// Reference path, now on a static 3x5 accumulator.
%2 = tensor.empty() : tensor<3x5xi32>
%3 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<3x5xi32>) -> tensor<3x5xi32>
%cast = tensor.cast %0 : tensor<?x?xi8> to tensor<3x?xi8>
%cast_0 = tensor.cast %1 : tensor<?x?xi8> to tensor<?x5xi8>
%4 = linalg.quantized_matmul ins(%cast, %cast_0, %c-128_i32, %c127_i32 : tensor<3x?xi8>, tensor<?x5xi8>, i32, i32) outs(%3 : tensor<3x5xi32>) -> tensor<3x5xi32>
// Cast back to dynamic for the final dynamic-typed comparison.
%cast_1 = tensor.cast %4 : tensor<3x5xi32> to tensor<?x?xi32>
// Decomposed path: encoded plain matmul (encodings keep dynamic original_type).
%5 = iree_encoding.set_encoding %0 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%6 = iree_encoding.set_encoding %1 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%7 = tensor.empty() : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%8 = linalg.fill ins(%c0_i32 : i32) outs(%7 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
// NOTE(review): these casts drop the encoding from the operand types in this
// dump — presumably folded further by later passes; verify against the
// downstream pipeline.
%cast_2 = tensor.cast %5 : tensor<?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> to tensor<3x?xi8>
%cast_3 = tensor.cast %6 : tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<?x?xi8>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> to tensor<?x5xi8>
%9 = linalg.matmul ins(%cast_2, %cast_3 : tensor<3x?xi8>, tensor<?x5xi8>) outs(%8 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>) -> tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%cast_4 = tensor.cast %9 : tensor<3x5xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> to tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>>
%10 = iree_encoding.unset_encoding %cast_4 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<?x?xi32>, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 16, 16, 16>>> -> tensor<?x?xi32>
// Static 3x5 slice of the (possibly padded) matmul result.
%extracted_slice = tensor.extract_slice %10[0, 0] [3, 5] [1, 1] : tensor<?x?xi32> to tensor<3x5xi32>
// rowsum(lhs) over static 3x4 (cast refined from the tied shape).
%11 = tensor.empty() : tensor<3x4xi32>
%cast_5 = tensor.cast %0 : tensor<?x?xi8> to tensor<3x4xi8>
%12 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%cast_5 : tensor<3x4xi8>) outs(%11 : tensor<3x4xi32>) {
^bb0(%in: i8, %out: i32):
%22 = arith.extsi %in : i8 to i32
linalg.yield %22 : i32
} -> tensor<3x4xi32>
%13 = tensor.empty() : tensor<3xi32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<3xi32>) -> tensor<3xi32>
%15 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "reduction"]} ins(%12 : tensor<3x4xi32>) outs(%14 : tensor<3xi32>) {
^bb0(%in: i32, %out: i32):
%22 = arith.addi %in, %out : i32
linalg.yield %22 : i32
} -> tensor<3xi32>
// colsum(rhs) over static 4x5.
%16 = tensor.empty() : tensor<4x5xi32>
%cast_6 = tensor.cast %1 : tensor<?x?xi8> to tensor<4x5xi8>
%17 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%cast_6 : tensor<4x5xi8>) outs(%16 : tensor<4x5xi32>) {
^bb0(%in: i8, %out: i32):
%22 = arith.extsi %in : i8 to i32
linalg.yield %22 : i32
} -> tensor<4x5xi32>
%18 = tensor.empty() : tensor<5xi32>
%19 = linalg.fill ins(%c0_i32 : i32) outs(%18 : tensor<5xi32>) -> tensor<5xi32>
%20 = linalg.generic {indexing_maps = [#map3, #map5], iterator_types = ["reduction", "parallel"]} ins(%17 : tensor<4x5xi32>) outs(%19 : tensor<5xi32>) {
^bb0(%in: i32, %out: i32):
%22 = arith.addi %in, %out : i32
linalg.yield %22 : i32
} -> tensor<5xi32>
// Zero-point corrections:
//   result = mm - (rowsum*zp_rhs + colsum*zp_lhs) + K*zp_lhs*zp_rhs
%21 = linalg.generic {indexing_maps = [#map3, #map4, #map5, #map6, #map6, #map6, #map3], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %15, %20, %c-128_i32, %c127_i32, %c4_i32 : tensor<3x5xi32>, tensor<3xi32>, tensor<5xi32>, i32, i32, i32) outs(%2 : tensor<3x5xi32>) {
^bb0(%in: i32, %in_8: i32, %in_9: i32, %in_10: i32, %in_11: i32, %in_12: i32, %out: i32):
%22 = arith.muli %in_8, %in_11 : i32
%23 = arith.muli %in_9, %in_10 : i32
%24 = arith.addi %22, %23 : i32
%25 = arith.muli %in_10, %in_11 : i32
%26 = arith.muli %in_12, %25 : i32
%27 = arith.subi %in, %24 : i32
%28 = arith.addi %27, %26 : i32
linalg.yield %28 : i32
} -> tensor<3x5xi32>
// Compare with dynamic types (matching the original function's contract).
%cast_7 = tensor.cast %21 : tensor<3x5xi32> to tensor<?x?xi32>
check.expect_eq(%cast_1, %cast_7) : tensor<?x?xi32>
util.return
}
// Post-canonicalization test driver. Identical to the IPO dump except that
// the unused %c3/%c4/%c5 index constants were removed (DCE).
util.func public @test_quantized_matmul_as_matmul() attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @test_quantized_matmul_as_matmul() -> ()"}} {
// Operand pair B: values spanning the full i8 range.
%cst = arith.constant dense<[[123, -125, 127, -128, 91], [-70, 37, 0, -40, 57], [-128, 127, -121, -100, 99], [127, 105, 83, 51, -128]]> : tensor<4x5xi8>
%cst_0 = arith.constant dense<[[127, -128, 0, 51], [-47, 101, -119, 0], [-128, 89, -63, 127]]> : tensor<3x4xi8>
// Operand pair A: small values.
%cst_1 = arith.constant dense<[[5, 4, 3, 2, 9], [1, 0, -1, -2, 8], [-3, -4, -5, -6, 7], [2, 3, 5, 7, 11]]> : tensor<4x5xi8>
%c127_i32 = arith.constant 127 : i32
%c-128_i32 = arith.constant -128 : i32
%c-57_i32 = arith.constant -57 : i32
%c41_i32 = arith.constant 41 : i32
%c3_i32 = arith.constant 3 : i32
%c-2_i32 = arith.constant -2 : i32
%c0_i32 = arith.constant 0 : i32
%cst_2 = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]> : tensor<3x4xi8>
// Barriers keep the inputs opaque so the checks run at runtime.
%0 = util.optimization_barrier %cst_2 : tensor<3x4xi8>
%1 = util.optimization_barrier %cst_1 : tensor<4x5xi8>
%2 = util.optimization_barrier %cst_0 : tensor<3x4xi8>
%3 = util.optimization_barrier %cst : tensor<4x5xi8>
// Static checks across zero, mixed-sign, arbitrary, and extreme zero points.
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c0_i32, %c0_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c0_i32, %c3_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c-2_i32, %c0_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%0, %1, %c-2_i32, %c3_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%2, %3, %c41_i32, %c-57_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
util.call @check_one_quantized_matmul_as_matmul_3x4x5(%2, %3, %c-128_i32, %c127_i32) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32) -> ()
// Dynamic-shape check on shape-erased copies of pair A.
%cast = tensor.cast %0 : tensor<3x4xi8> to tensor<?x?xi8>
%cast_3 = tensor.cast %1 : tensor<4x5xi8> to tensor<?x?xi8>
util.call @check_one_quantized_matmul_as_matmul_dynamic(%cast, %cast_3) : (tensor<?x?xi8>, tensor<?x?xi8>) -> ()
util.return
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment