// Gist by @Abhishek-Varma, created June 26, 2024 16:34.
// Note: this file has been truncated; only part of the full IR dump appears below.
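// What follows is a per-pass IR dump log from IREE's "amd-aie" backend lowering a single
// dispatch: a 128x128x256 bf16 matmul (linalg.fill + linalg.matmul). Each
// "IR Dump Before <Pass>" block is a snapshot of the dispatch immediately before the
// named pass runs.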
// -----// IR Dump Before TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
}
}
// -----// IR Dump Before TypePropagation (iree-codegen-type-propagation) //----- //
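// Note: the function body is identical in this and the next several snapshots;
// TypePropagation, BubbleUpOrdinalOps, BufferizeCopyOnlyDispatches, DecomposeSoftmax and
// MaterializeUserConfigs make no changes to this dispatch. The first visible change comes
// from AMDAIELoweringStrategy (see the dump before LowerExecutableUsingTransformDialect).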
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
}
// -----// IR Dump Before AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
}
// -----// IR Dump Before LowerExecutableUsingTransformDialect (iree-codegen-lower-executable-using-transform-dialect) //----- //
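// Note: compared to the previous snapshot, AMDAIELoweringStrategy has attached
// translation_info<Custom> to the function and, on the linalg.matmul, a lowering_config
// (tile sizes [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]) plus an amdaie.packing_config
// with two packing levels (packedSizes [64, 64, 64] and [0, 0, 0, 4, 4, 8]). These
// attributes drive the tiling and packing passes that follow.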
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
}
// -----// IR Dump Before AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
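// Note: AMDAIETileAndFuse has tiled the fill + matmul into an scf.forall over 128x128
// tiles mapped to #gpu.block<y>/<x> (a single iteration here, since the whole problem
// fits in one tile). The original untiled fill/matmul (%6, %7) are now dead and are
// deleted by AMDAIECleanup, as the next snapshot shows.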
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%9 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_1 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%9 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_1 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%7 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_1 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%7 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_1 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%7 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEPropagateDataLayout (iree-amdaie-propagate-data-layout) //----- //
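// Note: the first AMDAIEPackAndTranspose level (packedSizes [64, 64, 64]) has packed the
// matmul operands into tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16> and
// tensor<2x2x64x64xbf16> via tensor.pack, and rewritten the matmul as a 6-D
// linalg.generic with a matching tensor.unpack on the result.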
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_1 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%8 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%9 = tensor.empty() : tensor<4x2x64x64xbf16>
%10 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%11 = tensor.empty() : tensor<2x2x64x64xbf16>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<128x128xbf16> -> tensor<2x2x64x64xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16>) outs(%pack_3 : tensor<2x2x64x64xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_4: bf16, %out: bf16):
%13 = arith.mulf %in, %in_4 : bf16
%14 = arith.addf %out, %13 : bf16
linalg.yield %14 : bf16
} -> tensor<2x2x64x64xbf16>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_1 : tensor<128x128xbf16>) -> tensor<128x128xbf16>
%8 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%9 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%10 = tensor.empty() : tensor<2x2x64x64xbf16>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<128x128xbf16> -> tensor<2x2x64x64xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16>) outs(%pack_3 : tensor<2x2x64x64xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_4: bf16, %out: bf16):
%12 = arith.mulf %in, %in_4 : bf16
%13 = arith.addf %out, %12 : bf16
linalg.yield %13 : bf16
} -> tensor<2x2x64x64xbf16>
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before CSE (cse) //----- //
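// Note: canonicalization has folded the zero-fill onto the packed accumulator
// (linalg.fill now produces the tensor<2x2x64x64xbf16> operand directly) and the final
// tensor.unpack now writes straight into %extracted_slice_1, so the standalone 128x128
// fill is gone.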
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%9 = tensor.empty() : tensor<2x2x64x64xbf16>
%10 = linalg.fill ins(%cst : bf16) outs(%9 : tensor<2x2x64x64xbf16>) -> tensor<2x2x64x64xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16>) outs(%10 : tensor<2x2x64x64xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_3: bf16, %out: bf16):
%12 = arith.mulf %in, %in_3 : bf16
%13 = arith.addf %out, %12 : bf16
linalg.yield %13 : bf16
} -> tensor<2x2x64x64xbf16>
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%9 = tensor.empty() : tensor<2x2x64x64xbf16>
%10 = linalg.fill ins(%cst : bf16) outs(%9 : tensor<2x2x64x64xbf16>) -> tensor<2x2x64x64xbf16>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16>) outs(%10 : tensor<2x2x64x64xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_3: bf16, %out: bf16):
%12 = arith.mulf %in, %in_3 : bf16
%13 = arith.addf %out, %12 : bf16
linalg.yield %13 : bf16
} -> tensor<2x2x64x64xbf16>
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
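// Note: the first AMDAIEBufferizeToAllocation run has materialized the 2x2x64x64
// accumulator as a memref.alloc in memory space 1 (presumably the AIE memory-tile / L2
// level), exposed back to tensors via bufferization.to_tensor restrict writable, with a
// matching memref.dealloc at the end of the tile.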
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%9 = tensor.empty() : tensor<2x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%11 = linalg.fill ins(%cst : bf16) outs(%10 : tensor<2x2x64x64xbf16>) -> tensor<2x2x64x64xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16>) outs(%11 : tensor<2x2x64x64xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_3: bf16, %out: bf16):
%13 = arith.mulf %in, %in_3 : bf16
%14 = arith.addf %out, %13 : bf16
linalg.yield %14 : bf16
} -> tensor<2x2x64x64xbf16>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%9 = tensor.empty() : tensor<2x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%11 = linalg.fill ins(%cst : bf16) outs(%10 : tensor<2x2x64x64xbf16>) -> tensor<2x2x64x64xbf16>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x4x64x64xbf16>, tensor<4x2x64x64xbf16>) outs(%11 : tensor<2x2x64x64xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_3: bf16, %out: bf16):
%13 = arith.mulf %in, %in_3 : bf16
%14 = arith.addf %out, %13 : bf16
linalg.yield %14 : bf16
} -> tensor<2x2x64x64xbf16>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEPropagateDataLayout (iree-amdaie-propagate-data-layout) //----- //
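// Note: the second AMDAIEPackAndTranspose level (packedSizes [0, 0, 0, 4, 4, 8], outer
// dims permuted by [0, 1, 3, 2]) has further packed the operands into
// tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16> and tensor<2x2x16x16x4x4xbf16>,
// turning the compute into a 9-D linalg.generic over the 4x8 / 8x4 / 4x4 inner tiles.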
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%9 = tensor.empty() : tensor<2x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%11 = linalg.fill ins(%cst : bf16) outs(%10 : tensor<2x2x64x64xbf16>) -> tensor<2x2x64x64xbf16>
%12 = tensor.empty() : tensor<2x4x16x8x4x8xbf16>
%13 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%14 = tensor.empty() : tensor<4x2x8x16x4x8xbf16>
%15 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%16 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%17 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%pack_5 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<2x2x64x64xbf16> -> tensor<2x2x16x16x4x4xbf16>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16>) outs(%pack_5 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_7: bf16, %out: bf16):
%19 = arith.mulf %in, %in_7 : bf16
%20 = arith.addf %out, %19 : bf16
linalg.yield %20 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%unpack = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
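// Note: relative to the dump above, the dead tensor.empty ops are gone; every remaining tensor.empty now feeds a tensor.pack destination, while the zero fill still targets the 2x2x64x64 tensor backed by the memory-space-1 allocation.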
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = linalg.fill ins(%cst : bf16) outs(%9 : tensor<2x2x64x64xbf16>) -> tensor<2x2x64x64xbf16>
%11 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%12 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%13 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%pack_5 = tensor.pack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %13 : tensor<2x2x64x64xbf16> -> tensor<2x2x16x16x4x4xbf16>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16>) outs(%pack_5 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_7: bf16, %out: bf16):
%15 = arith.mulf %in, %in_7 : bf16
%16 = arith.addf %out, %15 : bf16
linalg.yield %16 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %10 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before CSE (cse) //----- //
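// Note: canonicalization has folded the tensor.pack of the fill result; the zero fill now initializes the packed 2x2x16x16x4x4 accumulator directly, and the 2x2x64x64 memory-space-1 buffer is only written by the trailing tensor.unpack.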
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%12 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16>) outs(%13 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_6: bf16, %out: bf16):
%15 = arith.mulf %in, %in_6 : bf16
%16 = arith.addf %out, %15 : bf16
linalg.yield %16 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_5 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_5 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
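// Note: identical to the previous dump; CSE found no redundant computations to merge at this point.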
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%12 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16>) outs(%13 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_6: bf16, %out: bf16):
%15 = arith.mulf %in, %in_6 : bf16
%16 = arith.addf %out, %15 : bf16
linalg.yield %16 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_5 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_5 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
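// Note: bufferize-to-allocation has materialized the 2x2x16x16x4x4 accumulator in its own memory-space-2 buffer (memref<2x2x16x16x4x4xbf16, 2 : i32>), exposed to the tensor world through bufferization.to_tensor and released with a matching memref.dealloc at the end of the forall body.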
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%12 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%alloc_5 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%14 = linalg.fill ins(%cst : bf16) outs(%13 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16>) outs(%14 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_7: bf16, %out: bf16):
%16 = arith.mulf %in, %in_7 : bf16
%17 = arith.addf %out, %16 : bf16
linalg.yield %17 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%unpack = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
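// Note: tile-and-fuse has tiled the packed reduction dimension with an scf.for over four steps; each iteration multiplies a 2x1x8x16x4x8 slice of the LHS pack by a 1x2x16x8x8x4 slice of the RHS pack into the shared 2x2x16x16x4x4 accumulator. The untiled linalg.generic above the loop is now dead and is dropped by the cleanup pass that follows.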
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%12 = tensor.empty() : tensor<2x2x16x16x4x4xbf16>
%alloc_5 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%14 = linalg.fill ins(%cst : bf16) outs(%13 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x4x8x16x4x8xbf16>, tensor<4x2x16x8x8x4xbf16>) outs(%14 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_8: bf16, %out: bf16):
%17 = arith.mulf %in, %in_8 : bf16
%18 = arith.addf %out, %17 : bf16
linalg.yield %18 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%c0_6 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%16 = scf.for %arg3 = %c0_6 to %c4 step %c1 iter_args(%arg4 = %14) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_8 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%extracted_slice_9 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%extracted_slice_10 = tensor.extract_slice %arg4[0, 0, 0, 0, 0, 0] [2, 2, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<2x2x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_8, %extracted_slice_9 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%extracted_slice_10 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_11: bf16, %out: bf16):
%18 = arith.mulf %in, %in_11 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%inserted_slice = tensor.insert_slice %17 into %arg4[0, 0, 0, 0, 0, 0] [2, 2, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
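// Note: cleanup removed the dead untiled linalg.generic together with the full-size extract_slice/insert_slice pair inside the loop, so the loop now yields the tiled generic's result directly; the loop-bound constants have been hoisted to the function entry.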
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%alloc_5 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_9: bf16, %out: bf16):
%16 = arith.mulf %in, %in_9 : bf16
%17 = arith.addf %out, %16 : bf16
linalg.yield %17 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
scf.yield %15 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before CSE (cse) //----- //
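// Note: identical to the previous dump; canonicalization made no further change here.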
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%alloc_5 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_9: bf16, %out: bf16):
%16 = arith.mulf %in, %in_9 : bf16
%17 = arith.addf %out, %16 : bf16
linalg.yield %17 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
scf.yield %15 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- //
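// Note: identical to the previous dump; CSE made no visible change.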
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%alloc_5 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_9: bf16, %out: bf16):
%16 = arith.mulf %in, %in_9 : bf16
%17 = arith.addf %out, %16 : bf16
linalg.yield %17 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
scf.yield %15 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
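// Note: fuse-pack-into-loop has sunk both two-level pack chains into the K loop: each iteration now slices a 128x64 column panel of the LHS and a 64x128 row panel of the RHS at offset d0 * 64, packs them to 2x1x64x64 / 1x2x64x64, and repacks to 2x1x8x16x4x8 / 1x2x16x8x8x4 for the tiled generic. The original packs above the loop are still present at this point and become dead code.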
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<128x256xbf16> -> tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %8 : tensor<256x128xbf16> -> tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x4x64x64xbf16> -> tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<4x2x64x64xbf16> -> tensor<4x2x16x8x8x4xbf16>
%alloc_5 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %15] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%extracted_slice_8 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 64, 64] [1, 1, 1, 1] : tensor<2x4x64x64xbf16> to tensor<2x1x64x64xbf16>
%pack_9 = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_8 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %pack[0, %arg3, 0, 0] [2, 1, 64, 64] [1, 1, 1, 1] : tensor<2x4x64x64xbf16> to tensor<2x1x64x64xbf16>
%extracted_slice_11 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%pack_12 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_13 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice_0[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%extracted_slice_15 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 64, 64] [1, 1, 1, 1] : tensor<4x2x64x64xbf16> to tensor<1x2x64x64xbf16>
%pack_16 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%extracted_slice_17 = tensor.extract_slice %pack_2[%arg3, 0, 0, 0] [1, 2, 64, 64] [1, 1, 1, 1] : tensor<4x2x64x64xbf16> to tensor<1x2x64x64xbf16>
%extracted_slice_18 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%pack_19 = tensor.pack %pack_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_18 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_19 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_21: bf16, %out: bf16):
%18 = arith.mulf %in, %in_21 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
scf.yield %17 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before CSE (cse) //----- //
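// Note: canonicalization dropped the now-dead pack chains above the loop and their unused slices; only the per-iteration packs survive. The (d0 * 64) offset is still computed twice per iteration, which the upcoming CSE can deduplicate.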
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 64, 64] [1, 1, 1, 1] : tensor<2x4x64x64xbf16> to tensor<2x1x64x64xbf16>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_5 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_6 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%extracted_slice_9 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 64, 64] [1, 1, 1, 1] : tensor<4x2x64x64xbf16> to tensor<1x2x64x64xbf16>
%pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_9 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%extracted_slice_11 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_7, %pack_12 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_13: bf16, %out: bf16):
%18 = arith.mulf %in, %in_13 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
scf.yield %17 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
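// Note on the dumps that follow: the matmul has already been tiled to a single
// 128x128 block (scf.forall with #gpu.block mapping) and packed twice per
// iteration of the scf.for reduction loop: LHS 128x64 -> 2x1x64x64 -> 2x1x8x16x4x8,
// RHS 64x128 -> 1x2x64x64 -> 1x2x16x8x8x4, accumulating into a 2x2x16x16x4x4 tile
// that is unpacked back to 128x128 after the loop.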
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 64, 64] [1, 1, 1, 1] : tensor<2x4x64x64xbf16> to tensor<2x1x64x64xbf16>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_5 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_6 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%15, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%extracted_slice_9 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 64, 64] [1, 1, 1, 1] : tensor<4x2x64x64xbf16> to tensor<1x2x64x64xbf16>
%pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_9 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%extracted_slice_11 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_7, %pack_12 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_13: bf16, %out: bf16):
%17 = arith.mulf %in, %in_13 : bf16
%18 = arith.addf %out, %17 : bf16
linalg.yield %18 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
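// In the next dump (input to iree-amdaie-tile-and-fuse), the
// iree-amdaie-bufferize-to-allocation pass has given the per-iteration
// first-level pack destinations their own memref.alloc / bufferization.to_tensor
// pairs in memory space 1 (2x1x64x64 and 1x2x64x64), with matching
// memref.dealloc ops at the end of each scf.for iteration.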
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 64, 64] [1, 1, 1, 1] : tensor<2x4x64x64xbf16> to tensor<2x1x64x64xbf16>
%alloc_6 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%16 = bufferization.to_tensor %alloc_6 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %16 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_7 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%pack_8 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_7 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_0[%15, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%extracted_slice_10 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 64, 64] [1, 1, 1, 1] : tensor<4x2x64x64xbf16> to tensor<1x2x64x64xbf16>
%alloc_11 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%extracted_slice_13 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%pack_14 = tensor.pack %pack_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_14 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_15: bf16, %out: bf16):
%19 = arith.mulf %in, %in_15 : bf16
%20 = arith.addf %out, %19 : bf16
linalg.yield %20 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
memref.dealloc %alloc_6 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_11 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %18 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
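// In the next dump (input to iree-amdaie-cleanup), tile-and-fuse has introduced a
// thread-level scf.forall over a 2x2 grid (#gpu.thread<y>/<x>) that computes one
// 1x1x16x16x4x4 slice of the accumulator per thread; the original full-size
// linalg.generic is now dead and left for the cleanup pass to remove.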
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = tensor.empty() : tensor<2x4x64x64xbf16>
%8 = tensor.empty() : tensor<4x2x64x64xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%10 = tensor.empty() : tensor<2x4x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<4x2x16x8x8x4xbf16>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%13 = linalg.fill ins(%cst : bf16) outs(%12 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%14 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 64, 64] [1, 1, 1, 1] : tensor<2x4x64x64xbf16> to tensor<2x1x64x64xbf16>
%alloc_6 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%16 = bufferization.to_tensor %alloc_6 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %16 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_7 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x4x8x16x4x8xbf16> to tensor<2x1x8x16x4x8xbf16>
%pack_8 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_7 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_0[%15, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%extracted_slice_10 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 64, 64] [1, 1, 1, 1] : tensor<4x2x64x64xbf16> to tensor<1x2x64x64xbf16>
%alloc_11 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%extracted_slice_13 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<4x2x16x8x8x4xbf16> to tensor<1x2x16x8x8x4xbf16>
%pack_14 = tensor.pack %pack_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_14 : tensor<2x1x8x16x4x8xbf16>, tensor<1x2x16x8x8x4xbf16>) outs(%arg4 : tensor<2x2x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_15: bf16, %out: bf16):
%20 = arith.mulf %in, %in_15 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<2x2x16x16x4x4xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_15 = tensor.extract_slice %pack_8[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_14[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_15, %extracted_slice_16 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_17 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_18: bf16, %out: bf16):
%21 = arith.mulf %in, %in_18 : bf16
%22 = arith.addf %out, %21 : bf16
linalg.yield %22 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_6 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_11 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
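// In the next dump (input to canonicalize), the cleanup pass has erased the dead
// full-size linalg.generic and the unused tensor.empty pack destinations
// (2x4x64x64, 4x2x64x64 and their 6-D counterparts), leaving only the
// thread-level scf.forall inside the reduction loop.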
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_8 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_13 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_14: bf16, %out: bf16):
%18 = arith.mulf %in, %in_14 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_8 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_8 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_13 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_14: bf16, %out: bf16):
%18 = arith.mulf %in, %in_14 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_8 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_8 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_13 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_14: bf16, %out: bf16):
%18 = arith.mulf %in, %in_14 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_8 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
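// In the next dump (input to canonicalize), iree-amdaie-fuse-pack-into-loop has
// sunk the second-level tensor.pack ops into the thread-level scf.forall: each
// (y, x) thread now packs its own 1x1x64x64 slice of the shared 2x1x64x64 /
// 1x2x64x64 buffers into 1x1x8x16x4x8 and 1x1x16x8x8x4 tiles feeding the
// linalg.generic; the loop-level packs they replace become dead and are removed
// by the following canonicalization.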
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x64x64xbf16> -> tensor<2x1x8x16x4x8xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_8 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x64x64xbf16> -> tensor<1x2x16x8x8x4xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%pack_13 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_12 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_14 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%extracted_slice_15 = tensor.extract_slice %pack_9[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%pack_17 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_16 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%18 = arith.mulf %in, %in_20 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_8 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
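// In the next dump (input to cse), canonicalization has dropped the now-unused
// loop-level second-stage packs and their extract_slices; the first-stage packs
// into the memref-backed 2x1x64x64 / 1x2x64x64 tensors remain in the scf.for,
// and only the per-thread packs survive inside the scf.forall.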
// -----// IR Dump Before CSE (cse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_7 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%pack_11 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_10 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_12 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_13 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_11, %pack_14 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_16: bf16, %out: bf16):
%18 = arith.mulf %in, %in_16 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_7 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_7 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%pack_11 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_10 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_12 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_13 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_11, %pack_14 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_16: bf16, %out: bf16):
%18 = arith.mulf %in, %in_16 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_7 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
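// NOTE: comparing this dump with the next one, iree-amdaie-bufferize-to-allocation
// appears to materialize the two thread-local pack destinations as explicit
// memref.alloc buffers in memory space 2 (memref<1x1x8x16x4x8xbf16, 2 : i32> and
// memref<1x1x16x8x8x4xbf16, 2 : i32>), exposed to tensor.pack through
// "bufferization.to_tensor ... restrict writable" and paired with memref.dealloc ops.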
// -----// IR Dump Before HoistStaticallyBoundAllocations (iree-hoist-statically-bound-allocations) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%alloc = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%alloc_7 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%alloc_11 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_13 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_14 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%alloc_15 = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%18 = bufferization.to_tensor %alloc_15 restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_16 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_17 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_18: bf16, %out: bf16):
%20 = arith.mulf %in, %in_18 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
memref.dealloc %alloc_11 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc_15 : memref<1x1x16x8x8x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_7 : memref<1x2x64x64xbf16, 1 : i32>
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
memref.dealloc %alloc : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x16x16x4x4xbf16, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
return
}
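// NOTE: judging from the next dump, iree-hoist-statically-bound-allocations lifts the
// statically sized memref.alloc ops out of the loop nests to the function entry and
// sinks the matching memref.dealloc ops to just before the return, so each buffer is
// allocated once and reused across iterations.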
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = tensor.empty() : tensor<2x1x8x16x4x8xbf16>
%11 = tensor.empty() : tensor<1x2x16x8x8x4xbf16>
%12 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 8, 16, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x8x16x4x8xbf16> to tensor<1x1x8x16x4x8xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_14 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_15 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 16, 8, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x16x8x8x4xbf16> to tensor<1x1x16x8x8x4xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_16 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_17 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_18: bf16, %out: bf16):
%20 = arith.mulf %in, %in_18 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
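// NOTE: comparing this dump with the next one (the CSE input), this canonicalization
// round appears to fold away the now-unused tensor.empty staging tensors
// (tensor<2x1x8x16x4x8xbf16> and tensor<1x2x16x8x8x4xbf16>) and the extract_slice ops
// derived from them, since the packs already write into the hoisted memory-space-2
// allocations.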
// -----// IR Dump Before CSE (cse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %11] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %12 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%11, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%13 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%14 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%15 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_13 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_14 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_16: bf16, %out: bf16):
%18 = arith.mulf %in, %in_16 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %14 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
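// NOTE: the next dump (the AMDAIEPeelForLoop input) is identical to this one, so CSE
// appears to find nothing further to eliminate at this point.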
// -----// IR Dump Before AMDAIEPeelForLoop (iree-amdaie-peel-for-loop) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%10 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %11] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %12 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%11, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%13 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%14 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%15 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_13 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_14 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_16: bf16, %out: bf16):
%18 = arith.mulf %in, %in_16 : bf16
%19 = arith.addf %out, %18 : bf16
linalg.yield %19 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %14 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
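// NOTE: the next dump suggests iree-amdaie-peel-for-loop peels the first and last
// iterations of the K reduction loop: the single scf.for over [0, 4) is split into
// three loops over [0, 1), [1, 3) and [3, 4), chained together through their iter_args.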
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%c1_7 = arith.constant 1 : index
%10 = scf.for %arg3 = %c0 to %c1_7 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_17: bf16, %out: bf16):
%20 = arith.mulf %in, %in_17 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%c3 = arith.constant 3 : index
%11 = scf.for %arg3 = %c1_7 to %c3 step %c1 iter_args(%arg4 = %10) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_17: bf16, %out: bf16):
%20 = arith.mulf %in, %in_17 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%12 = scf.for %arg3 = %c3 to %c4 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_17: bf16, %out: bf16):
%20 = arith.mulf %in, %in_17 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %16 : tensor<2x2x16x16x4x4xbf16>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_8 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_8 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
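// NOTE: comparing this dump with the next one, the canonicalizer appears to unroll the
// two peeled single-iteration loops into straight-line code: the first K tile (column
// offset 0) and the last K tile (column offset 192) are emitted directly around the
// remaining scf.for over [1, 3). The dump below is the input to
// iree-amdaie-fuse-fill-into-forall, which presumably fuses the linalg.fill of the
// accumulator into the thread-level scf.forall; in this dump the fill still sits
// outside it.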
// -----// IR Dump Before AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %9) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_17 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%20 = arith.mulf %in, %in_20 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%13 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x16x16x4x4xbf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %17] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_17 = tensor.extract_slice %extracted_slice_5[%17, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %19 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_20, %pack_22 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_23 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_24: bf16, %out: bf16):
%24 = arith.mulf %in, %in_24 : bf16
%25 = arith.addf %out, %24 : bf16
linalg.yield %25 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_15 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_17 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%20 = arith.mulf %in, %in_20 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_14 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_14 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIEFuseConsumerIntoLoop (iree-amdaie-fuse-consumer-into-loop) //----- //
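// NOTE: the dump below is the input to iree-amdaie-fuse-consumer-into-loop. The fusion
// candidates are the two trailing tensor.unpack ops that follow the last thread-level
// scf.forall; the next dump shows the first of them pulled into that forall as a second
// shared_outs operand, leaving only the final unpack to the 128x128 output outside the loop.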
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
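      // NOTE: the two tensor.pack ops above stage the first 64-wide K slice (column offset 0)
      // into the memory-space-1 buffers: the 128x64 slice of A becomes 2x1x64x64 in %alloc_2
      // and the 64x128 slice of B becomes 1x2x64x64 in %alloc_1 (outer dims 128/64 = 2 and
      // 64/64 = 1, with 64x64 inner tiles). Memory space 1 presumably corresponds to the
      // shared (L2) level of the AIE memory hierarchy.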
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_17 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
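        // NOTE: the per-core repacking above moves one 1x1x64x64 tile of each operand from the
        // space-1 buffers into the memory-space-2 buffers: A becomes 1x1x8x16x4x8 (inner tiles
        // 4x8 on the two 64-dims, outer dims permuted by [0, 1, 3, 2]) and B becomes
        // 1x1x16x8x8x4 (inner tiles 8x4). The 4x8 / 8x4 / 4x4 inner tiles presumably match the
        // bf16 vector MAC shape of the AIE core; memory space 2 is presumably the core-local
        // (L1) level.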
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_19 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
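        // NOTE: this per-tile fill initializes the k = 0 accumulator slice; the whole-tensor
        // fill %9 above is left without uses after tiling and is presumably removed by later
        // canonicalization.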
%extracted_slice_20 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_21: bf16, %out: bf16):
%21 = arith.mulf %in, %in_21 : bf16
%22 = arith.addf %out, %21 : bf16
linalg.yield %22 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
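      // NOTE: the reduction dimension (K = 256, tiled by 64) is peeled: the slices at column
      // offset 0 above handle k = 0, the scf.for below covers k = 1..2 (affine_map d0 * 64
      // giving offsets 64 and 128), and the slices at offset 192 after the loop handle the
      // final K iteration.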
%13 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x16x16x4x4xbf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %17] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_17 = tensor.extract_slice %extracted_slice_5[%17, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %19 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_20, %pack_22 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_23 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_24: bf16, %out: bf16):
%24 = arith.mulf %in, %in_24 : bf16
%25 = arith.addf %out, %24 : bf16
linalg.yield %25 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_15 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_17 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%20 = arith.mulf %in, %in_20 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x16x16x4x4xbf16> -> tensor<2x2x64x64xbf16>
%unpack_14 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_14 into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- //
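// NOTE: relative to the previous dump, iree-amdaie-fuse-consumer-into-loop has fused the
// space-1 unpack into the last peeled scf.forall: that loop now carries two shared_outs
// (%arg5 = %13, %arg6 = %7) and yields %16:2, with a per-tile tensor.unpack inside the loop
// body whose result is committed via tensor.parallel_insert_slice; only the unpack of %16#1
// back to the 128x128 output remains outside.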
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%21 = arith.mulf %in, %in_20 : bf16
%22 = arith.addf %out, %21 : bf16
linalg.yield %22 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%13 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x16x16x4x4xbf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %19 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_23: bf16, %out: bf16):
%24 = arith.mulf %in, %in_23 : bf16
%25 = arith.addf %out, %24 : bf16
linalg.yield %25 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_22: bf16, %out: bf16):
%20 = arith.mulf %in, %in_22 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
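// NOTE: this dump matches the previous one; iree-amdaie-fuse-pack-into-loop appears to have
// made no change to this function, as the tensor.pack ops feeding the compute loops were
// already materialized per K iteration.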
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%21 = arith.mulf %in, %in_20 : bf16
%22 = arith.addf %out, %21 : bf16
linalg.yield %22 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%13 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x16x16x4x4xbf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %19 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_23: bf16, %out: bf16):
%24 = arith.mulf %in, %in_23 : bf16
%25 = arith.addf %out, %24 : bf16
linalg.yield %25 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_22: bf16, %out: bf16):
%20 = arith.mulf %in, %in_22 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%9 = linalg.fill ins(%cst : bf16) outs(%8 : tensor<2x2x16x16x4x4xbf16>) -> tensor<2x2x16x16x4x4xbf16>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%19 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_20: bf16, %out: bf16):
%21 = arith.mulf %in, %in_20 : bf16
%22 = arith.addf %out, %21 : bf16
linalg.yield %22 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%13 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x16x16x4x4xbf16>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %19 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_23: bf16, %out: bf16):
%24 = arith.mulf %in, %in_23 : bf16
%25 = arith.addf %out, %24 : bf16
linalg.yield %25 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %15 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_22: bf16, %out: bf16):
%20 = arith.mulf %in, %in_22 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
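// Note: the function below shows the matmul after tiling and packing. The K dimension (256, tiled by 64)
// is split into a peeled first iteration (k = 0, which also carries the linalg.fill initialization), a
// steady-state scf.for over k = 1..2, and a peeled last iteration (offset 192) that additionally unpacks
// the accumulator into the 2x2x64x64 buffer in memory space 1. Each iteration distributes its work over a
// 2x2 core array via scf.forall mapped to #gpu.thread<y>/#gpu.thread<x>.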
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_19: bf16, %out: bf16):
%20 = arith.mulf %in, %in_19 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_23: bf16, %out: bf16):
%23 = arith.mulf %in, %in_23 : bf16
%24 = arith.addf %out, %23 : bf16
linalg.yield %24 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_21: bf16, %out: bf16):
%19 = arith.mulf %in, %in_21 : bf16
%20 = arith.addf %out, %19 : bf16
linalg.yield %20 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIEInsertLoopsForVectorization (iree-amdaie-insert-loops-for-vectorization) //----- //
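// Note: the IR below is structurally identical to the previous dump; AMDAIECleanup found nothing left to
// remove at this stage.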
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_19: bf16, %out: bf16):
%20 = arith.mulf %in, %in_19 : bf16
%21 = arith.addf %out, %20 : bf16
linalg.yield %21 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_23: bf16, %out: bf16):
%23 = arith.mulf %in, %in_23 : bf16
%24 = arith.addf %out, %23 : bf16
linalg.yield %24 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x8x16x4x8xbf16>, tensor<1x1x16x8x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_21: bf16, %out: bf16):
%19 = arith.mulf %in, %in_21 : bf16
%20 = arith.addf %out, %19 : bf16
linalg.yield %20 : bf16
} -> tensor<1x1x16x16x4x4xbf16>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before AMDAIEVectorization (iree-amdaie-vectorization) //----- //
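// Note: AMDAIEInsertLoopsForVectorization has rewritten each packed linalg.generic into a six-deep scf.for
// nest (trip counts 1, 1, 1, 16, 16, 8 over the outer packed dimensions) whose body multiplies a
// 1x1x1x1x4x8 slice of the LHS by a 1x1x1x1x8x4 slice of the RHS into a 1x1x1x1x4x4 accumulator slice.
// The loop bounds are materialized as fresh arith.constant ops, so the dump below carries many duplicate
// %c0_*/%c1_* constants until CSE runs.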
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c1_0 = arith.constant 1 : index
%c1_1 = arith.constant 1 : index
%c1_2 = arith.constant 1 : index
%c1_3 = arith.constant 1 : index
%c1_4 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c0_5 = arith.constant 0 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
%c1_9 = arith.constant 1 : index
%c1_10 = arith.constant 1 : index
%c1_11 = arith.constant 1 : index
%c1_12 = arith.constant 1 : index
%c0_13 = arith.constant 0 : index
%c0_14 = arith.constant 0 : index
%c0_15 = arith.constant 0 : index
%c1_16 = arith.constant 1 : index
%c1_17 = arith.constant 1 : index
%c1_18 = arith.constant 1 : index
%c1_19 = arith.constant 1 : index
%c1_20 = arith.constant 1 : index
%c1_21 = arith.constant 1 : index
%c0_22 = arith.constant 0 : index
%c0_23 = arith.constant 0 : index
%c0_24 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%c0_25 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%c1_26 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_27 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_28 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_29 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_30 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_31 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0_25) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0_25) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0_25) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_32 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_33 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_31 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_30 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%extracted_slice_34 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%9 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_34 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_35 = tensor.extract_slice %extracted_slice_32[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%10 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_36 = tensor.pack %extracted_slice_35 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_41 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_42 = tensor.pack %extracted_slice_41 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_43 = tensor.extract_slice %pack_36[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_45 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_45 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%c0_46 = arith.constant 0 : index
%c1_47 = arith.constant 1 : index
%c1_48 = arith.constant 1 : index
%c0_49 = arith.constant 0 : index
%c1_50 = arith.constant 1 : index
%c1_51 = arith.constant 1 : index
%c0_52 = arith.constant 0 : index
%c1_53 = arith.constant 1 : index
%c1_54 = arith.constant 1 : index
%c0_55 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c1_56 = arith.constant 1 : index
%c0_57 = arith.constant 0 : index
%c16_58 = arith.constant 16 : index
%c1_59 = arith.constant 1 : index
%c0_60 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1_61 = arith.constant 1 : index
%19 = scf.for %arg6 = %c0_46 to %c1_47 step %c1_48 iter_args(%arg7 = %18) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg8 = %c0_49 to %c1_50 step %c1_51 iter_args(%arg9 = %arg7) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg10 = %c0_52 to %c1_53 step %c1_54 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg12 = %c0_55 to %c16 step %c1_56 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg14 = %c0_57 to %c16_58 step %c1_59 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg16 = %c0_60 to %c8 step %c1_61 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_62 = tensor.extract_slice %pack_42[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_63 = tensor.extract_slice %pack_44[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_64 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_62, %extracted_slice_63 : tensor<1x1x1x1x4x8xbf16>, tensor<1x1x1x1x8x4xbf16>) outs(%extracted_slice_64 : tensor<1x1x1x1x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_65: bf16, %out: bf16):
%26 = arith.mulf %in, %in_65 : bf16
%27 = arith.addf %out, %26 : bf16
linalg.yield %27 : bf16
} -> tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %25 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1_26 to %c3 step %c1_26 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_41 = tensor.extract_slice %extracted_slice[0, %16] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%17 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_42 = tensor.pack %extracted_slice_41 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_43 = tensor.extract_slice %extracted_slice_32[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%18 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_45 = tensor.extract_slice %pack_42[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%20 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_46 = tensor.pack %extracted_slice_45 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_47 = tensor.extract_slice %pack_44[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_48 = tensor.pack %extracted_slice_47 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_49 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%c0_50 = arith.constant 0 : index
%c1_51 = arith.constant 1 : index
%c1_52 = arith.constant 1 : index
%c0_53 = arith.constant 0 : index
%c1_54 = arith.constant 1 : index
%c1_55 = arith.constant 1 : index
%c0_56 = arith.constant 0 : index
%c1_57 = arith.constant 1 : index
%c1_58 = arith.constant 1 : index
%c0_59 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c1_60 = arith.constant 1 : index
%c0_61 = arith.constant 0 : index
%c16_62 = arith.constant 16 : index
%c1_63 = arith.constant 1 : index
%c0_64 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1_65 = arith.constant 1 : index
%22 = scf.for %arg8 = %c0_50 to %c1_51 step %c1_52 iter_args(%arg9 = %extracted_slice_49) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg10 = %c0_53 to %c1_54 step %c1_55 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg12 = %c0_56 to %c1_57 step %c1_58 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%25 = scf.for %arg14 = %c0_59 to %c16 step %c1_60 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%26 = scf.for %arg16 = %c0_61 to %c16_62 step %c1_63 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%27 = scf.for %arg18 = %c0_64 to %c8 step %c1_65 iter_args(%arg19 = %arg17) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_66 = tensor.extract_slice %pack_46[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_67 = tensor.extract_slice %pack_48[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_68 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_66, %extracted_slice_67 : tensor<1x1x1x1x4x8xbf16>, tensor<1x1x1x1x8x4xbf16>) outs(%extracted_slice_68 : tensor<1x1x1x1x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_69: bf16, %out: bf16):
%29 = arith.mulf %in, %in_69 : bf16
%30 = arith.addf %out, %29 : bf16
linalg.yield %30 : bf16
} -> tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %28 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %27 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %26 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %25 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_37 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%13 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_38 = tensor.pack %extracted_slice_37 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_39 = tensor.extract_slice %extracted_slice_32[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%14 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_40 = tensor.pack %extracted_slice_39 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_41 = tensor.extract_slice %pack_38[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_42 = tensor.pack %extracted_slice_41 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_43 = tensor.extract_slice %pack_40[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_45 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%c0_46 = arith.constant 0 : index
%c1_47 = arith.constant 1 : index
%c1_48 = arith.constant 1 : index
%c0_49 = arith.constant 0 : index
%c1_50 = arith.constant 1 : index
%c1_51 = arith.constant 1 : index
%c0_52 = arith.constant 0 : index
%c1_53 = arith.constant 1 : index
%c1_54 = arith.constant 1 : index
%c0_55 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c1_56 = arith.constant 1 : index
%c0_57 = arith.constant 0 : index
%c16_58 = arith.constant 16 : index
%c1_59 = arith.constant 1 : index
%c0_60 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1_61 = arith.constant 1 : index
%18 = scf.for %arg7 = %c0_46 to %c1_47 step %c1_48 iter_args(%arg8 = %extracted_slice_45) -> (tensor<1x1x16x16x4x4xbf16>) {
%19 = scf.for %arg9 = %c0_49 to %c1_50 step %c1_51 iter_args(%arg10 = %arg8) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg11 = %c0_52 to %c1_53 step %c1_54 iter_args(%arg12 = %arg10) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg13 = %c0_55 to %c16 step %c1_56 iter_args(%arg14 = %arg12) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg15 = %c0_57 to %c16_58 step %c1_59 iter_args(%arg16 = %arg14) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg17 = %c0_60 to %c8 step %c1_61 iter_args(%arg18 = %arg16) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_64 = tensor.extract_slice %pack_42[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_65 = tensor.extract_slice %pack_44[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_66 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_64, %extracted_slice_65 : tensor<1x1x1x1x4x8xbf16>, tensor<1x1x1x1x8x4xbf16>) outs(%extracted_slice_66 : tensor<1x1x1x1x4x4xbf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 64], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: bf16, %in_67: bf16, %out: bf16):
%25 = arith.mulf %in, %in_67 : bf16
%26 = arith.addf %out, %25 : bf16
linalg.yield %26 : bf16
} -> tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %24 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %19 : tensor<1x1x16x16x4x4xbf16>
}
%extracted_slice_62 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%unpack_63 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_62 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_63 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_33 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_31 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_30 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_29 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_28 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_27 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
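// Note: relative to the previous dump, the duplicated index constants have been folded and hoisted, leaving
// single %c0, %c1, %c3, %c8 and %c16 values at the top of the function; the loop-nest structure around the
// 4x8 * 8x4 -> 4x4 micro-kernel is unchanged.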
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = tensor.empty() : tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%19 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %18) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_19 = tensor.extract_slice %pack_15[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_21 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%25 = vector.transfer_read %extracted_slice_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%26 = vector.transfer_read %extracted_slice_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%27 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
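                  // Innermost micro-kernel: the vector.contract below multiplies a
                  // 4x8 bf16 tile by an 8x4 bf16 tile and accumulates into a 4x4 tile
                  // (the trailing two dimensions of the packed layout).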
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %25, %26, %27 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%29 = vector.transfer_write %28, %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %29 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
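      // The reduction over K is split into three parts: the first 64-wide K tile was
      // handled by the scf.forall above (fused with the linalg.fill initialization),
      // the scf.for below covers K tiles 1 and 2 (column offsets 64 and 128), and the
      // last tile at offset 192 is peeled off afterwards, where the unpack of the
      // result is fused into the same thread-level loop nest.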
%12 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%22 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_22) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%25 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%26 = scf.for %arg16 = %c0 to %c16 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%27 = scf.for %arg18 = %c0 to %c8 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_23 = tensor.extract_slice %pack_19[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_24 = tensor.extract_slice %pack_21[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_25 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%28 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%29 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%30 = vector.transfer_read %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %29, %30 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%32 = vector.transfer_write %31, %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %32 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %27 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %26 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %25 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_18) -> (tensor<1x1x16x16x4x4xbf16>) {
%19 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg13 = %c0 to %c16 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg15 = %c0 to %c16 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg17 = %c0 to %c8 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_21 = tensor.extract_slice %pack_15[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_22 = tensor.extract_slice %pack_17[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_23 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%24 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%25 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%26 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %24, %25, %26 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%28 = vector.transfer_write %27, %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %28 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %19 : tensor<1x1x16x16x4x4xbf16>
}
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
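// Note: compared with the previous dump, the tensor.empty for the 128x128 result has
// been replaced by a flow.dispatch.tensor.load of the output binding (%2), so no
// standalone empty tensor remains. empty-tensor-to-alloc-tensor would convert any
// remaining tensor.empty ops into bufferization.alloc_tensor ops ahead of
// bufferization; here it has nothing left to rewrite.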
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>> -> tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%19 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %18) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_19 = tensor.extract_slice %pack_15[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_21 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%25 = vector.transfer_read %extracted_slice_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%26 = vector.transfer_read %extracted_slice_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%27 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %25, %26, %27 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%29 = vector.transfer_write %28, %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %29 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%22 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_22) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%25 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%26 = scf.for %arg16 = %c0 to %c16 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%27 = scf.for %arg18 = %c0 to %c8 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_23 = tensor.extract_slice %pack_19[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_24 = tensor.extract_slice %pack_21[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_25 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%28 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%29 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%30 = vector.transfer_read %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %29, %30 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%32 = vector.transfer_write %31, %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %32 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %27 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %26 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %25 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_18) -> (tensor<1x1x16x16x4x4xbf16>) {
%19 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg13 = %c0 to %c16 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg15 = %c0 to %c16 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg17 = %c0 to %c8 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_21 = tensor.extract_slice %pack_15[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_22 = tensor.extract_slice %pack_17[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_23 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%24 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%25 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%26 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %24, %25, %26 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%28 = vector.transfer_write %27, %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %28 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %19 : tensor<1x1x16x16x4x4xbf16>
}
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
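// Note: iree-codegen-iree-comprehensive-bufferize runs one-shot bufferization,
// rewriting the tensor-level ops below into operations on memrefs. The
// bufferization.to_tensor ... restrict writable views of the pre-allocated
// memref.alloc buffers (memory spaces 1 and 2) let bufferization reuse those
// buffers directly instead of introducing new allocations.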
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>> -> tensor<128x128xbf16>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) shared_outs(%arg2 = %5) -> (tensor<128x128xbf16>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [128, 256] [1, 1] : tensor<128x256xbf16> to tensor<128x256xbf16>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 128] [1, 1] : tensor<256x128xbf16> to tensor<256x128xbf16>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> to tensor<128x128xbf16>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x64x64xbf16, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x16x16x4x4xbf16, 2 : i32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %9 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %10 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = linalg.fill ins(%cst : bf16) outs(%extracted_slice_18 : tensor<1x1x16x16x4x4xbf16>) -> tensor<1x1x16x16x4x4xbf16>
%19 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %18) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_19 = tensor.extract_slice %pack_15[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_21 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%25 = vector.transfer_read %extracted_slice_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%26 = vector.transfer_read %extracted_slice_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%27 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %25, %26, %27 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%29 = vector.transfer_write %28, %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %29 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1 to %c3 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x16x16x4x4xbf16>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %17 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %18 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x16x16x4x4xbf16>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%22 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_22) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x16x16x4x4xbf16>) {
%24 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x16x16x4x4xbf16>) {
%25 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x16x16x4x4xbf16>) {
%26 = scf.for %arg16 = %c0 to %c16 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x16x16x4x4xbf16>) {
%27 = scf.for %arg18 = %c0 to %c8 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_23 = tensor.extract_slice %pack_19[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_24 = tensor.extract_slice %pack_21[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_25 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%28 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%29 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%30 = vector.transfer_read %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %29, %30 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%32 = vector.transfer_write %31, %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %32 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %27 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %26 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %25 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %24 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x16x16x4x4xbf16>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 192] [128, 64] [1, 1] : tensor<128x256xbf16> to tensor<128x64xbf16>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x64x64xbf16, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %13 : tensor<128x64xbf16> -> tensor<2x1x64x64xbf16>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[192, 0] [64, 128] [1, 1] : tensor<256x128xbf16> to tensor<64x128xbf16>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x64x64xbf16, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x128xbf16> -> tensor<1x2x64x64xbf16>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x16x16x4x4xbf16>, tensor<2x2x64x64xbf16>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x1x64x64xbf16> to tensor<1x1x64x64xbf16>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x8x16x4x8xbf16, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x64x64xbf16> -> tensor<1x1x8x16x4x8xbf16>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x16x8x8x4xbf16, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x64x64xbf16> -> tensor<1x1x16x8x8x4xbf16>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x16x16x4x4xbf16> to tensor<1x1x16x16x4x4xbf16>
%18 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_18) -> (tensor<1x1x16x16x4x4xbf16>) {
%19 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x16x16x4x4xbf16>) {
%20 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x16x16x4x4xbf16>) {
%21 = scf.for %arg13 = %c0 to %c16 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x16x16x4x4xbf16>) {
%22 = scf.for %arg15 = %c0 to %c16 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x16x16x4x4xbf16>) {
%23 = scf.for %arg17 = %c0 to %c8 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x16x16x4x4xbf16>) {
%extracted_slice_21 = tensor.extract_slice %pack_15[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x16x4x8xbf16> to tensor<1x1x1x1x4x8xbf16>
%extracted_slice_22 = tensor.extract_slice %pack_17[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x8x8x4xbf16> to tensor<1x1x1x1x8x4xbf16>
%extracted_slice_23 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> to tensor<1x1x1x1x4x4xbf16>
%24 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16>
%25 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16>
%26 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xbf16>, vector<1x1x1x1x4x4xbf16>
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %24, %25, %26 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
%28 = vector.transfer_write %27, %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, tensor<1x1x1x1x4x4xbf16>
%inserted_slice = tensor.insert_slice %28 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xbf16> into tensor<1x1x16x16x4x4xbf16>
scf.yield %inserted_slice : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %23 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %22 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %21 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %20 : tensor<1x1x16x16x4x4xbf16>
}
scf.yield %19 : tensor<1x1x16x16x4x4xbf16>
}
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<2x2x64x64xbf16> to tensor<1x1x64x64xbf16>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x16x16x4x4xbf16> -> tensor<1x1x64x64xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : tensor<1x1x64x64xbf16> into tensor<2x2x64x64xbf16>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x16x16x4x4xbf16> into tensor<2x2x16x16x4x4xbf16>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_6 : tensor<2x2x64x64xbf16> -> tensor<128x128xbf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [128, 128] [1, 1] : tensor<128x128xbf16> into tensor<128x128xbf16>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xbf16> -> !flow.dispatch.tensor<writeonly:tensor<128x128xbf16>>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
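// Note: the function below is now fully bufferized: the binding subspans are memrefs
// with 64-byte alignment assumptions, tensor.extract_slice has become memref.subview,
// tensor.pack has become iree_linalg_ext.pack writing directly into the allocated
// buffers, and the scf.forall loops no longer carry tensor results.
// resolve-shaped-type-result-dims folds dim queries on the results of shaped ops;
// with all shapes static here it is likely close to a no-op.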
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_14 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c16 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_16 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%10 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%11 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%12 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %10, %11, %12 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %13, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
scf.yield %arg15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %5 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_15 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%3 = scf.for %arg2 = %c1 to %c3 step %c1 iter_args(%arg3 = %alloc_3) -> (memref<2x2x16x16x4x4xbf16, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_12 = memref.subview %subview[0, %4] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_13 = memref.subview %subview_5[%4, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg4, %arg5) in (2, 2) {
%subview_14 = memref.subview %alloc_2[%arg4, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_1[0, %arg5, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_16 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %subview_16) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%10 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_18 = memref.subview %alloc_0[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%11 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%12 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%13 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %13 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %14, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_21 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
scf.yield %arg17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %10 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_17 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%5 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %arg3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_14 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c16 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_18 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%10 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%11 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%12 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %10, %11, %12 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %13, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_21 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
scf.yield %arg15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %5 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_15 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_16 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
%subview_17 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_11 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_11 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_14 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c16 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_16 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%10 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%11 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%12 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %10, %11, %12 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %13, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
scf.yield %arg15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %5 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_15 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%3 = scf.for %arg2 = %c1 to %c3 step %c1 iter_args(%arg3 = %alloc_3) -> (memref<2x2x16x16x4x4xbf16, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_12 = memref.subview %subview[0, %4] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_13 = memref.subview %subview_5[%4, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg4, %arg5) in (2, 2) {
%subview_14 = memref.subview %alloc_2[%arg4, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_1[0, %arg5, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_16 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %subview_16) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c16 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%10 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_18 = memref.subview %alloc_0[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%11 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%12 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%13 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %12, %13 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %14, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_21 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
scf.yield %arg17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %10 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_17 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%5 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %arg3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_14 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c16 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c16 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_18 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%10 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%11 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%12 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %10, %11, %12 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %13, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_21 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
scf.yield %arg15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %5 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_15 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_16 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
%subview_17 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_11 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_11 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_14 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
}
}
}
%subview_15 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_14 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_12 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_13 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_14 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_16 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_18 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %subview_16[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_21 = memref.subview %subview_16[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
}
}
}
%subview_17 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_16 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_18 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%subview_21 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
}
}
}
%subview_15 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_16 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
%subview_17 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_14 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_11 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_11 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
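// Note (annotation, not compiler output): the function above still carries pairs of memref.subview
// ops with identical operands feeding the identity copy-back linalg.generic ops, e.g. in the innermost loop:
//   %subview_18 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : ...
//   %subview_19 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : ...
// In the next dump, taken after CSE has run, each such duplicate is folded into a single subview
// (renumbered, e.g. %subview_16) that is used for both the ins and the outs operand of the copy.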
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_16[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_16 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
}
}
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
}
}
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_17[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_17 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_14 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_14 : memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
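// Note (annotation, not compiler output): comparing with the dump below, the canonicalizer erases the
// identity copies whose ins and outs alias the same buffer, i.e. patterns of the form
//   linalg.generic {...} ins(%subview_16 : ...) outs(%subview_16 : ...) { linalg.yield %in : bf16 }
// disappear, while the linalg.fill, the vector.transfer_read / vector.contract / vector.transfer_write
// loops and the iree_linalg_ext.pack/unpack ops are left unchanged.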
// -----// IR Dump Before CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_16[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_17[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
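// A minimal standalone sketch (value names assumed, not taken from this dump): with the unit outer dimensions dropped, the 9-D vector.contract in the innermost loop above is a plain 4x8 * 8x4 -> 4x4 bf16 multiply-accumulate, acc[m, n] += sum over k of a[m, k] * b[k, n], using the same #vector.kind<add> contraction.
// func.func @contract_4x8x4_sketch(%a: vector<4x8xbf16>, %b: vector<8x4xbf16>, %acc: vector<4x4xbf16>) -> vector<4x4xbf16> {
//   %r = vector.contract {indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, affine_map<(m, n, k) -> (m, n)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %a, %b, %acc : vector<4x8xbf16>, vector<8x4xbf16> into vector<4x4xbf16>
//   return %r : vector<4x4xbf16>
// }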
// -----// IR Dump Before HoistStaticallyBoundAllocations (iree-hoist-statically-bound-allocations) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_16[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_17[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
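// A hedged note on the packing in the dump above (derived from the pack attributes, not spelled out in the dump): the pack into memory space 1 with inner_dims_pos = [0, 1] and inner_tiles = [64, 64] tiles a 128x64 view into memref<2x1x64x64xbf16, 1 : i32>, mapping element (i, j) to (i floordiv 64, j floordiv 64, i mod 64, j mod 64); the pack into memory space 2 with outer_dims_perm = [0, 1, 3, 2], inner_dims_pos = [2, 3] then splits each 64x64 tile into 4x8 sub-tiles for the first operand (memref<1x1x8x16x4x8xbf16, 2 : i32>) and 8x4 sub-tiles for the second operand (memref<1x1x16x8x8x4xbf16, 2 : i32>), which are exactly the tile shapes read by the vector.contract micro-kernel.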
// -----// IR Dump Before LowerUKernelOpsToCalls (iree-codegen-lower-ukernel-ops-to-calls) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_16[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_17[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
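// A hedged note on the next pass (behavior inferred from the surrounding dumps, not stated here): iree-codegen-erase-hal-descriptor-type-from-memref drops the #hal.descriptor_type memory-space annotation from the subspan memrefs, e.g.
//   memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>  ->  memref<128x256xbf16>
// so later memref/vector passes operate on ordinary memrefs; the effect is visible by comparing the subspan types in the dump below with those in the FoldMemRefAliasOps dump that follows it.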
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_16[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_17[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
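// A hedged sketch of the rewrite performed by fold-memref-alias-ops (assumed example, not taken from this dump): a vector.transfer_read or transfer_write through a memref.subview is folded into a direct access on the base memref at the composed indices, removing the intermediate subview.
// Before:
//   %sv = memref.subview %base[%i, 0] [1, 8] [1, 1] : memref<16x8xbf16> to memref<1x8xbf16, strided<[8, 1], offset: ?>>
//   %v = vector.transfer_read %sv[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x8xbf16, strided<[8, 1], offset: ?>>, vector<1x8xbf16>
// After:
//   %v = vector.transfer_read %base[%i, %c0], %cst {in_bounds = [true, true]} : memref<16x8xbf16>, vector<1x8xbf16>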
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
memref.assume_alignment %0, 64 : memref<128x256xbf16>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
memref.assume_alignment %1, 64 : memref<256x128xbf16>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
memref.assume_alignment %2, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %0[%arg0, 0] [128, 256] [1, 1] : memref<128x256xbf16> to memref<128x256xbf16, strided<[256, 1], offset: ?>>
%subview_5 = memref.subview %1[0, %arg1] [256, 128] [1, 1] : memref<256x128xbf16> to memref<256x128xbf16, strided<[128, 1], offset: ?>>
%subview_6 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16> to memref<128x128xbf16, strided<[128, 1], offset: ?>>
%subview_7 = memref.subview %subview[0, 0] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_13 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_14[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_16[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%6 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %4, %5, %6 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %7, %subview_18[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 192] [128, 64] [1, 1] : memref<128x256xbf16, strided<[256, 1], offset: ?>> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_10 = memref.subview %subview_5[192, 0] [64, 128] [1, 1] : memref<256x128xbf16, strided<[128, 1], offset: ?>> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x8x16x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x8x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%3 = vector.transfer_read %subview_15[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[4096, 4096, 512, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %subview_16[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[4096, 4096, 256, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %subview_17[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x1x1x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_6 : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
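// Annotation (not part of the compiler output): relative to the dump above, the dump that
// follows ("Before AMDAIEPackToDma") only differs in that chained memref.subview ops have been
// folded away - the pack sources are taken directly as subviews of %0 and %1, and the innermost
// vector.transfer_read/write ops index %alloc_0, %alloc and %alloc_3 directly instead of going
// through per-iteration subviews. The iree_linalg_ext.pack/unpack ops are still present here and
// are what the pack-to-DMA lowering is expected to rewrite in the subsequent dumps.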
// -----// IR Dump Before AMDAIEPackToDma (iree-amdaie-pack-to-dma) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
memref.assume_alignment %0, 64 : memref<128x256xbf16>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
memref.assume_alignment %1, 64 : memref<256x128xbf16>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
memref.assume_alignment %2, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16> to memref<128x128xbf16, strided<[128, 1], offset: ?>>
%subview_5 = memref.subview %0[%arg0, 0] [128, 64] [1, 1] : memref<128x256xbf16> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_6 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<256x128xbf16> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_9 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_10 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_11 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_11 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%3 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_9 = memref.subview %0[%arg0, %3] [128, 64] [1, 1] : memref<128x256xbf16> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_10 = memref.subview %1[%4, %arg1] [64, 128] [1, 1] : memref<256x128xbf16> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%5 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%6 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%7 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %8, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_7 = memref.subview %0[%arg0, 192] [128, 64] [1, 1] : memref<128x256xbf16> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
%subview_8 = memref.subview %1[192, %arg1] [64, 128] [1, 1] : memref<256x128xbf16> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_1 : (memref<64x128xbf16, strided<[128, 1], offset: ?>> memref<1x2x64x64xbf16, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_9 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x16x4x8xbf16, 2 : i32>)
%subview_10 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32> memref<1x1x16x8x8x4xbf16, 2 : i32>)
%subview_11 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%3 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
%subview_12 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_12 : (memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview : (memref<2x2x64x64xbf16, 1 : i32> memref<128x128xbf16, strided<[128, 1], offset: ?>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
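// Annotation (not part of the compiler output): in the dump below ("Before CopyToDma"), each
// iree_linalg_ext.pack/unpack has been rewritten by iree-amdaie-pack-to-dma into an
// air.dma_memcpy_nd whose (offsets)[sizes][strides] operands are materialized as individual
// arith.constant index values, which accounts for the long runs of %cN_* constants below.
// For example, the L2 pack of the A operand,
//   iree_linalg_ext.pack %subview_5 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %alloc_2 : (memref<128x64xbf16, strided<[256, 1], offset: ?>> memref<2x1x64x64xbf16, 1 : i32>)
// becomes, once the duplicated constants are folded (see the last dump further down), the 4-d strided copy
//   air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %0[%c0, %c0, %arg0, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)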
// -----// IR Dump Before CopyToDma (air-copy-to-dma) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
memref.assume_alignment %0, 64 : memref<128x256xbf16>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
memref.assume_alignment %1, 64 : memref<256x128xbf16>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
memref.assume_alignment %2, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%subview = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<128x128xbf16> to memref<128x128xbf16, strided<[128, 1], offset: ?>>
%subview_5 = memref.subview %0[%arg0, 0] [128, 64] [1, 1] : memref<128x256xbf16> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
%c0_6 = arith.constant 0 : index
%c0_7 = arith.constant 0 : index
%c0_8 = arith.constant 0 : index
%c0_9 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1_10 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c64_11 = arith.constant 64 : index
%c4096 = arith.constant 4096 : index
%c4096_12 = arith.constant 4096 : index
%c64_13 = arith.constant 64 : index
%c1_14 = arith.constant 1 : index
%c0_15 = arith.constant 0 : index
%c0_16 = arith.constant 0 : index
%c0_17 = arith.constant 0 : index
%c2_18 = arith.constant 2 : index
%c1_19 = arith.constant 1 : index
%c64_20 = arith.constant 64 : index
%c64_21 = arith.constant 64 : index
%c16384 = arith.constant 16384 : index
%c64_22 = arith.constant 64 : index
%c256 = arith.constant 256 : index
%c1_23 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_2[%c0_6, %c0_7, %c0_8, %c0_9] [%c2, %c1_10, %c64, %c64_11] [%c4096, %c4096_12, %c64_13, %c1_14], %0[%c0_15, %c0_16, %arg0, %c0_17] [%c2_18, %c1_19, %c64_20, %c64_21] [%c16384, %c64_22, %c256, %c1_23]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)
%subview_24 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<256x128xbf16> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
%c0_25 = arith.constant 0 : index
%c0_26 = arith.constant 0 : index
%c0_27 = arith.constant 0 : index
%c0_28 = arith.constant 0 : index
%c1_29 = arith.constant 1 : index
%c2_30 = arith.constant 2 : index
%c64_31 = arith.constant 64 : index
%c64_32 = arith.constant 64 : index
%c8192 = arith.constant 8192 : index
%c4096_33 = arith.constant 4096 : index
%c64_34 = arith.constant 64 : index
%c1_35 = arith.constant 1 : index
%c0_36 = arith.constant 0 : index
%c0_37 = arith.constant 0 : index
%c0_38 = arith.constant 0 : index
%c1_39 = arith.constant 1 : index
%c2_40 = arith.constant 2 : index
%c64_41 = arith.constant 64 : index
%c64_42 = arith.constant 64 : index
%c8192_43 = arith.constant 8192 : index
%c64_44 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c1_45 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_1[%c0_25, %c0_26, %c0_27, %c0_28] [%c1_29, %c2_30, %c64_31, %c64_32] [%c8192, %c4096_33, %c64_34, %c1_35], %1[%c0_36, %c0_37, %c0_38, %arg1] [%c1_39, %c2_40, %c64_41, %c64_42] [%c8192_43, %c64_44, %c128, %c1_45]) : (memref<1x2x64x64xbf16, 1 : i32>, memref<256x128xbf16>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_109 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%c0_113 = arith.constant 0 : index
%c0_114 = arith.constant 0 : index
%c0_115 = arith.constant 0 : index
%c1_116 = arith.constant 1 : index
%c1_117 = arith.constant 1 : index
%c8_118 = arith.constant 8 : index
%c16_119 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c8_120 = arith.constant 8 : index
%c4096_121 = arith.constant 4096 : index
%c4096_122 = arith.constant 4096 : index
%c512 = arith.constant 512 : index
%c32 = arith.constant 32 : index
%c8_123 = arith.constant 8 : index
%c1_124 = arith.constant 1 : index
%c0_125 = arith.constant 0 : index
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c1_130 = arith.constant 1 : index
%c1_131 = arith.constant 1 : index
%c8_132 = arith.constant 8 : index
%c16_133 = arith.constant 16 : index
%c4_134 = arith.constant 4 : index
%c8_135 = arith.constant 8 : index
%c4096_136 = arith.constant 4096 : index
%c4096_137 = arith.constant 4096 : index
%c8_138 = arith.constant 8 : index
%c256_139 = arith.constant 256 : index
%c64_140 = arith.constant 64 : index
%c1_141 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_0[%c0_110, %c0_111, %c0_112, %c0_113, %c0_114, %c0_115] [%c1_116, %c1_117, %c8_118, %c16_119, %c4, %c8_120] [%c4096_121, %c4096_122, %c512, %c32, %c8_123, %c1_124], %alloc_2[%arg2, %c0_125, %c0_126, %c0_127, %c0_128, %c0_129] [%c1_130, %c1_131, %c8_132, %c16_133, %c4_134, %c8_135] [%c4096_136, %c4096_137, %c8_138, %c256_139, %c64_140, %c1_141]) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<2x1x64x64xbf16, 1 : i32>)
%subview_142 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_143 = arith.constant 0 : index
%c0_144 = arith.constant 0 : index
%c0_145 = arith.constant 0 : index
%c0_146 = arith.constant 0 : index
%c0_147 = arith.constant 0 : index
%c0_148 = arith.constant 0 : index
%c1_149 = arith.constant 1 : index
%c1_150 = arith.constant 1 : index
%c16_151 = arith.constant 16 : index
%c8_152 = arith.constant 8 : index
%c8_153 = arith.constant 8 : index
%c4_154 = arith.constant 4 : index
%c4096_155 = arith.constant 4096 : index
%c4096_156 = arith.constant 4096 : index
%c256_157 = arith.constant 256 : index
%c32_158 = arith.constant 32 : index
%c4_159 = arith.constant 4 : index
%c1_160 = arith.constant 1 : index
%c0_161 = arith.constant 0 : index
%c0_162 = arith.constant 0 : index
%c0_163 = arith.constant 0 : index
%c0_164 = arith.constant 0 : index
%c0_165 = arith.constant 0 : index
%c1_166 = arith.constant 1 : index
%c1_167 = arith.constant 1 : index
%c16_168 = arith.constant 16 : index
%c8_169 = arith.constant 8 : index
%c8_170 = arith.constant 8 : index
%c4_171 = arith.constant 4 : index
%c8192_172 = arith.constant 8192 : index
%c4096_173 = arith.constant 4096 : index
%c4_174 = arith.constant 4 : index
%c512_175 = arith.constant 512 : index
%c64_176 = arith.constant 64 : index
%c1_177 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc[%c0_143, %c0_144, %c0_145, %c0_146, %c0_147, %c0_148] [%c1_149, %c1_150, %c16_151, %c8_152, %c8_153, %c4_154] [%c4096_155, %c4096_156, %c256_157, %c32_158, %c4_159, %c1_160], %alloc_1[%c0_161, %arg3, %c0_162, %c0_163, %c0_164, %c0_165] [%c1_166, %c1_167, %c16_168, %c8_169, %c8_170, %c4_171] [%c8192_172, %c4096_173, %c4_174, %c512_175, %c64_176, %c1_177]) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x2x64x64xbf16, 1 : i32>)
%subview_178 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview_178 : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%3 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_109 = memref.subview %0[%arg0, %3] [128, 64] [1, 1] : memref<128x256xbf16> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%c0_113 = arith.constant 0 : index
%c2_114 = arith.constant 2 : index
%c1_115 = arith.constant 1 : index
%c64_116 = arith.constant 64 : index
%c64_117 = arith.constant 64 : index
%c4096_118 = arith.constant 4096 : index
%c4096_119 = arith.constant 4096 : index
%c64_120 = arith.constant 64 : index
%c1_121 = arith.constant 1 : index
%c0_122 = arith.constant 0 : index
%c0_123 = arith.constant 0 : index
%c2_124 = arith.constant 2 : index
%c1_125 = arith.constant 1 : index
%c64_126 = arith.constant 64 : index
%c64_127 = arith.constant 64 : index
%c16384_128 = arith.constant 16384 : index
%c64_129 = arith.constant 64 : index
%c256_130 = arith.constant 256 : index
%c1_131 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_2[%c0_110, %c0_111, %c0_112, %c0_113] [%c2_114, %c1_115, %c64_116, %c64_117] [%c4096_118, %c4096_119, %c64_120, %c1_121], %0[%c0_122, %c0_123, %arg0, %3] [%c2_124, %c1_125, %c64_126, %c64_127] [%c16384_128, %c64_129, %c256_130, %c1_131]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%subview_132 = memref.subview %1[%4, %arg1] [64, 128] [1, 1] : memref<256x128xbf16> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
%c0_133 = arith.constant 0 : index
%c0_134 = arith.constant 0 : index
%c0_135 = arith.constant 0 : index
%c0_136 = arith.constant 0 : index
%c1_137 = arith.constant 1 : index
%c2_138 = arith.constant 2 : index
%c64_139 = arith.constant 64 : index
%c64_140 = arith.constant 64 : index
%c8192_141 = arith.constant 8192 : index
%c4096_142 = arith.constant 4096 : index
%c64_143 = arith.constant 64 : index
%c1_144 = arith.constant 1 : index
%c0_145 = arith.constant 0 : index
%c0_146 = arith.constant 0 : index
%c1_147 = arith.constant 1 : index
%c2_148 = arith.constant 2 : index
%c64_149 = arith.constant 64 : index
%c64_150 = arith.constant 64 : index
%c8192_151 = arith.constant 8192 : index
%c64_152 = arith.constant 64 : index
%c128_153 = arith.constant 128 : index
%c1_154 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_1[%c0_133, %c0_134, %c0_135, %c0_136] [%c1_137, %c2_138, %c64_139, %c64_140] [%c8192_141, %c4096_142, %c64_143, %c1_144], %1[%c0_145, %c0_146, %4, %arg1] [%c1_147, %c2_148, %c64_149, %c64_150] [%c8192_151, %c64_152, %c128_153, %c1_154]) : (memref<1x2x64x64xbf16, 1 : i32>, memref<256x128xbf16>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_155 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_156 = arith.constant 0 : index
%c0_157 = arith.constant 0 : index
%c0_158 = arith.constant 0 : index
%c0_159 = arith.constant 0 : index
%c0_160 = arith.constant 0 : index
%c0_161 = arith.constant 0 : index
%c1_162 = arith.constant 1 : index
%c1_163 = arith.constant 1 : index
%c8_164 = arith.constant 8 : index
%c16_165 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c8_166 = arith.constant 8 : index
%c4096_167 = arith.constant 4096 : index
%c4096_168 = arith.constant 4096 : index
%c512 = arith.constant 512 : index
%c32 = arith.constant 32 : index
%c8_169 = arith.constant 8 : index
%c1_170 = arith.constant 1 : index
%c0_171 = arith.constant 0 : index
%c0_172 = arith.constant 0 : index
%c0_173 = arith.constant 0 : index
%c0_174 = arith.constant 0 : index
%c0_175 = arith.constant 0 : index
%c1_176 = arith.constant 1 : index
%c1_177 = arith.constant 1 : index
%c8_178 = arith.constant 8 : index
%c16_179 = arith.constant 16 : index
%c4_180 = arith.constant 4 : index
%c8_181 = arith.constant 8 : index
%c4096_182 = arith.constant 4096 : index
%c4096_183 = arith.constant 4096 : index
%c8_184 = arith.constant 8 : index
%c256_185 = arith.constant 256 : index
%c64_186 = arith.constant 64 : index
%c1_187 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_0[%c0_156, %c0_157, %c0_158, %c0_159, %c0_160, %c0_161] [%c1_162, %c1_163, %c8_164, %c16_165, %c4, %c8_166] [%c4096_167, %c4096_168, %c512, %c32, %c8_169, %c1_170], %alloc_2[%arg3, %c0_171, %c0_172, %c0_173, %c0_174, %c0_175] [%c1_176, %c1_177, %c8_178, %c16_179, %c4_180, %c8_181] [%c4096_182, %c4096_183, %c8_184, %c256_185, %c64_186, %c1_187]) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<2x1x64x64xbf16, 1 : i32>)
%subview_188 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_189 = arith.constant 0 : index
%c0_190 = arith.constant 0 : index
%c0_191 = arith.constant 0 : index
%c0_192 = arith.constant 0 : index
%c0_193 = arith.constant 0 : index
%c0_194 = arith.constant 0 : index
%c1_195 = arith.constant 1 : index
%c1_196 = arith.constant 1 : index
%c16_197 = arith.constant 16 : index
%c8_198 = arith.constant 8 : index
%c8_199 = arith.constant 8 : index
%c4_200 = arith.constant 4 : index
%c4096_201 = arith.constant 4096 : index
%c4096_202 = arith.constant 4096 : index
%c256_203 = arith.constant 256 : index
%c32_204 = arith.constant 32 : index
%c4_205 = arith.constant 4 : index
%c1_206 = arith.constant 1 : index
%c0_207 = arith.constant 0 : index
%c0_208 = arith.constant 0 : index
%c0_209 = arith.constant 0 : index
%c0_210 = arith.constant 0 : index
%c0_211 = arith.constant 0 : index
%c1_212 = arith.constant 1 : index
%c1_213 = arith.constant 1 : index
%c16_214 = arith.constant 16 : index
%c8_215 = arith.constant 8 : index
%c8_216 = arith.constant 8 : index
%c4_217 = arith.constant 4 : index
%c8192_218 = arith.constant 8192 : index
%c4096_219 = arith.constant 4096 : index
%c4_220 = arith.constant 4 : index
%c512_221 = arith.constant 512 : index
%c64_222 = arith.constant 64 : index
%c1_223 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc[%c0_189, %c0_190, %c0_191, %c0_192, %c0_193, %c0_194] [%c1_195, %c1_196, %c16_197, %c8_198, %c8_199, %c4_200] [%c4096_201, %c4096_202, %c256_203, %c32_204, %c4_205, %c1_206], %alloc_1[%c0_207, %arg4, %c0_208, %c0_209, %c0_210, %c0_211] [%c1_212, %c1_213, %c16_214, %c8_215, %c8_216, %c4_217] [%c8192_218, %c4096_219, %c4_220, %c512_221, %c64_222, %c1_223]) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x2x64x64xbf16, 1 : i32>)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%5 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%6 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%7 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %8, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_46 = memref.subview %0[%arg0, 192] [128, 64] [1, 1] : memref<128x256xbf16> to memref<128x64xbf16, strided<[256, 1], offset: ?>>
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%c0_50 = arith.constant 0 : index
%c2_51 = arith.constant 2 : index
%c1_52 = arith.constant 1 : index
%c64_53 = arith.constant 64 : index
%c64_54 = arith.constant 64 : index
%c4096_55 = arith.constant 4096 : index
%c4096_56 = arith.constant 4096 : index
%c64_57 = arith.constant 64 : index
%c1_58 = arith.constant 1 : index
%c0_59 = arith.constant 0 : index
%c0_60 = arith.constant 0 : index
%c192 = arith.constant 192 : index
%c2_61 = arith.constant 2 : index
%c1_62 = arith.constant 1 : index
%c64_63 = arith.constant 64 : index
%c64_64 = arith.constant 64 : index
%c16384_65 = arith.constant 16384 : index
%c64_66 = arith.constant 64 : index
%c256_67 = arith.constant 256 : index
%c1_68 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_2[%c0_47, %c0_48, %c0_49, %c0_50] [%c2_51, %c1_52, %c64_53, %c64_54] [%c4096_55, %c4096_56, %c64_57, %c1_58], %0[%c0_59, %c0_60, %arg0, %c192] [%c2_61, %c1_62, %c64_63, %c64_64] [%c16384_65, %c64_66, %c256_67, %c1_68]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)
%subview_69 = memref.subview %1[192, %arg1] [64, 128] [1, 1] : memref<256x128xbf16> to memref<64x128xbf16, strided<[128, 1], offset: ?>>
%c0_70 = arith.constant 0 : index
%c0_71 = arith.constant 0 : index
%c0_72 = arith.constant 0 : index
%c0_73 = arith.constant 0 : index
%c1_74 = arith.constant 1 : index
%c2_75 = arith.constant 2 : index
%c64_76 = arith.constant 64 : index
%c64_77 = arith.constant 64 : index
%c8192_78 = arith.constant 8192 : index
%c4096_79 = arith.constant 4096 : index
%c64_80 = arith.constant 64 : index
%c1_81 = arith.constant 1 : index
%c0_82 = arith.constant 0 : index
%c0_83 = arith.constant 0 : index
%c192_84 = arith.constant 192 : index
%c1_85 = arith.constant 1 : index
%c2_86 = arith.constant 2 : index
%c64_87 = arith.constant 64 : index
%c64_88 = arith.constant 64 : index
%c8192_89 = arith.constant 8192 : index
%c64_90 = arith.constant 64 : index
%c128_91 = arith.constant 128 : index
%c1_92 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_1[%c0_70, %c0_71, %c0_72, %c0_73] [%c1_74, %c2_75, %c64_76, %c64_77] [%c8192_78, %c4096_79, %c64_80, %c1_81], %1[%c0_82, %c0_83, %c192_84, %arg1] [%c1_85, %c2_86, %c64_87, %c64_88] [%c8192_89, %c64_90, %c128_91, %c1_92]) : (memref<1x2x64x64xbf16, 1 : i32>, memref<256x128xbf16>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_109 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x1x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%c0_113 = arith.constant 0 : index
%c0_114 = arith.constant 0 : index
%c0_115 = arith.constant 0 : index
%c1_116 = arith.constant 1 : index
%c1_117 = arith.constant 1 : index
%c8_118 = arith.constant 8 : index
%c16_119 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c8_120 = arith.constant 8 : index
%c4096_121 = arith.constant 4096 : index
%c4096_122 = arith.constant 4096 : index
%c512 = arith.constant 512 : index
%c32 = arith.constant 32 : index
%c8_123 = arith.constant 8 : index
%c1_124 = arith.constant 1 : index
%c0_125 = arith.constant 0 : index
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c1_130 = arith.constant 1 : index
%c1_131 = arith.constant 1 : index
%c8_132 = arith.constant 8 : index
%c16_133 = arith.constant 16 : index
%c4_134 = arith.constant 4 : index
%c8_135 = arith.constant 8 : index
%c4096_136 = arith.constant 4096 : index
%c4096_137 = arith.constant 4096 : index
%c8_138 = arith.constant 8 : index
%c256_139 = arith.constant 256 : index
%c64_140 = arith.constant 64 : index
%c1_141 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_0[%c0_110, %c0_111, %c0_112, %c0_113, %c0_114, %c0_115] [%c1_116, %c1_117, %c8_118, %c16_119, %c4, %c8_120] [%c4096_121, %c4096_122, %c512, %c32, %c8_123, %c1_124], %alloc_2[%arg2, %c0_125, %c0_126, %c0_127, %c0_128, %c0_129] [%c1_130, %c1_131, %c8_132, %c16_133, %c4_134, %c8_135] [%c4096_136, %c4096_137, %c8_138, %c256_139, %c64_140, %c1_141]) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<2x1x64x64xbf16, 1 : i32>)
%subview_142 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_143 = arith.constant 0 : index
%c0_144 = arith.constant 0 : index
%c0_145 = arith.constant 0 : index
%c0_146 = arith.constant 0 : index
%c0_147 = arith.constant 0 : index
%c0_148 = arith.constant 0 : index
%c1_149 = arith.constant 1 : index
%c1_150 = arith.constant 1 : index
%c16_151 = arith.constant 16 : index
%c8_152 = arith.constant 8 : index
%c8_153 = arith.constant 8 : index
%c4_154 = arith.constant 4 : index
%c4096_155 = arith.constant 4096 : index
%c4096_156 = arith.constant 4096 : index
%c256_157 = arith.constant 256 : index
%c32_158 = arith.constant 32 : index
%c4_159 = arith.constant 4 : index
%c1_160 = arith.constant 1 : index
%c0_161 = arith.constant 0 : index
%c0_162 = arith.constant 0 : index
%c0_163 = arith.constant 0 : index
%c0_164 = arith.constant 0 : index
%c0_165 = arith.constant 0 : index
%c1_166 = arith.constant 1 : index
%c1_167 = arith.constant 1 : index
%c16_168 = arith.constant 16 : index
%c8_169 = arith.constant 8 : index
%c8_170 = arith.constant 8 : index
%c4_171 = arith.constant 4 : index
%c8192_172 = arith.constant 8192 : index
%c4096_173 = arith.constant 4096 : index
%c4_174 = arith.constant 4 : index
%c512_175 = arith.constant 512 : index
%c64_176 = arith.constant 64 : index
%c1_177 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc[%c0_143, %c0_144, %c0_145, %c0_146, %c0_147, %c0_148] [%c1_149, %c1_150, %c16_151, %c8_152, %c8_153, %c4_154] [%c4096_155, %c4096_156, %c256_157, %c32_158, %c4_159, %c1_160], %alloc_1[%c0_161, %arg3, %c0_162, %c0_163, %c0_164, %c0_165] [%c1_166, %c1_167, %c16_168, %c8_169, %c8_170, %c4_171] [%c8192_172, %c4096_173, %c4_174, %c512_175, %c64_176, %c1_177]) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x2x64x64xbf16, 1 : i32>)
%subview_178 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%3 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
%subview_179 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xbf16, 1 : i32> to memref<1x1x64x64xbf16, strided<[8192, 4096, 64, 1], offset: ?>, 1 : i32>
%c0_180 = arith.constant 0 : index
%c0_181 = arith.constant 0 : index
%c1_182 = arith.constant 1 : index
%c1_183 = arith.constant 1 : index
%c64_184 = arith.constant 64 : index
%c64_185 = arith.constant 64 : index
%c8192_186 = arith.constant 8192 : index
%c4096_187 = arith.constant 4096 : index
%c64_188 = arith.constant 64 : index
%c1_189 = arith.constant 1 : index
%c0_190 = arith.constant 0 : index
%c0_191 = arith.constant 0 : index
%c0_192 = arith.constant 0 : index
%c0_193 = arith.constant 0 : index
%c1_194 = arith.constant 1 : index
%c1_195 = arith.constant 1 : index
%c16_196 = arith.constant 16 : index
%c4_197 = arith.constant 4 : index
%c16_198 = arith.constant 16 : index
%c4_199 = arith.constant 4 : index
%c8192_200 = arith.constant 8192 : index
%c4096_201 = arith.constant 4096 : index
%c16_202 = arith.constant 16 : index
%c4_203 = arith.constant 4 : index
%c256_204 = arith.constant 256 : index
%c1_205 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_4[%arg2, %arg3, %c0_180, %c0_181] [%c1_182, %c1_183, %c64_184, %c64_185] [%c8192_186, %c4096_187, %c64_188, %c1_189], %alloc_3[%arg2, %arg3, %c0_190, %c0_191, %c0_192, %c0_193] [%c1_194, %c1_195, %c16_196, %c4_197, %c16_198, %c4_199] [%c8192_200, %c4096_201, %c16_202, %c4_203, %c256_204, %c1_205]) : (memref<2x2x64x64xbf16, 1 : i32>, memref<2x2x16x16x4x4xbf16, 2 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c128_93 = arith.constant 128 : index
%c128_94 = arith.constant 128 : index
%c128_95 = arith.constant 128 : index
%c1_96 = arith.constant 1 : index
%c0_97 = arith.constant 0 : index
%c0_98 = arith.constant 0 : index
%c0_99 = arith.constant 0 : index
%c0_100 = arith.constant 0 : index
%c2_101 = arith.constant 2 : index
%c64_102 = arith.constant 64 : index
%c2_103 = arith.constant 2 : index
%c64_104 = arith.constant 64 : index
%c8192_105 = arith.constant 8192 : index
%c64_106 = arith.constant 64 : index
%c4096_107 = arith.constant 4096 : index
%c1_108 = arith.constant 1 : index
air.dma_memcpy_nd (%2[%arg0, %arg1] [%c128_93, %c128_94] [%c128_95, %c1_96], %alloc_4[%c0_97, %c0_98, %c0_99, %c0_100] [%c2_101, %c64_102, %c2_103, %c64_104] [%c8192_105, %c64_106, %c4096_107, %c1_108]) : (memref<128x128xbf16>, memref<2x2x64x64xbf16, 1 : i32>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
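// Annotation (not part of the compiler output): in the dump below ("Before AMDAIEAIRDmaToAMDAIEDma"),
// the per-copy arith.constant values introduced by the pack-to-DMA rewrite have been CSE'd and
// hoisted to the function entry, so each transfer is now a single air.dma_memcpy_nd over the shared
// constants. The iree-amdaie-air-dma-to-amdaie-dma pass that runs next presumably lowers these
// air.dma_memcpy_nd ops into the equivalent amdaie-dialect DMA operations, as its name suggests.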
// -----// IR Dump Before AMDAIEAIRDmaToAMDAIEDma (iree-amdaie-air-dma-to-amdaie-dma) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
memref.assume_alignment %0, 64 : memref<128x256xbf16>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
memref.assume_alignment %1, 64 : memref<256x128xbf16>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
memref.assume_alignment %2, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %0[%c0, %c0, %arg0, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %1[%c0, %c0, %c0, %arg1] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (memref<1x2x64x64xbf16, 1 : i32>, memref<256x128xbf16>)
scf.forall (%arg2, %arg3) in (2, 2) {
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %alloc_2[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<2x1x64x64xbf16, 1 : i32>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %alloc_1[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x2x64x64xbf16, 1 : i32>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%3 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %0[%c0, %c0, %arg0, %3] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %1[%c0, %c0, %4, %arg1] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (memref<1x2x64x64xbf16, 1 : i32>, memref<256x128xbf16>)
scf.forall (%arg3, %arg4) in (2, 2) {
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %alloc_2[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<2x1x64x64xbf16, 1 : i32>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %alloc_1[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x2x64x64xbf16, 1 : i32>)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%5 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%6 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%7 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %8, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %0[%c0, %c0, %arg0, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (memref<2x1x64x64xbf16, 1 : i32>, memref<128x256xbf16>)
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %1[%c0, %c0, %c192, %arg1] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (memref<1x2x64x64xbf16, 1 : i32>, memref<256x128xbf16>)
scf.forall (%arg2, %arg3) in (2, 2) {
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %alloc_2[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (memref<1x1x8x16x4x8xbf16, 2 : i32>, memref<2x1x64x64xbf16, 1 : i32>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %alloc_1[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (memref<1x1x16x8x8x4xbf16, 2 : i32>, memref<1x2x64x64xbf16, 1 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%3 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%4 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%5 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%6 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %3, %4, %5 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %6, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
air.dma_memcpy_nd (%alloc_4[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %alloc_3[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (memref<2x2x64x64xbf16, 1 : i32>, memref<2x2x16x16x4x4xbf16, 2 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
air.dma_memcpy_nd (%2[%arg0, %arg1] [%c128, %c128] [%c128, %c1], %alloc_4[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (memref<128x128xbf16>, memref<2x2x64x64xbf16, 1 : i32>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
// -----// IR Dump Before AMDAIENormalizeLoopBounds (iree-amdaie-normalize-loop-bounds) //----- //
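// The dump below shows the IR before iree-amdaie-normalize-loop-bounds. Compared with the
// preceding dump, the air.dma_memcpy_nd copies on raw memrefs have been rewritten as
// amdaie.dma_cpy_nd ops on amdaie.logicalobjectfifo values (one from_memref per use site).
// The inner reduction loop still has a non-zero lower bound
// (scf.for %arg2 = %c1 to %c3 step %c1); the normalization pass presumably rewrites it to a
// zero-based, unit-step loop and recovers the original index with an affine.apply (d0 + 1),
// which is what the following dump shows.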
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%7 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%8 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%9 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%10 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%11 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%12 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%13 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%19 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%20 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %21, 64 : memref<128x256xbf16>
%25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %25, 64 : memref<256x128xbf16>
%29 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %29, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (128, 128) {
%31 = amdaie.dma_cpy_nd(%12[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %22[%c0, %c0, %arg0, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%32 = amdaie.dma_cpy_nd(%6[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %26[%c0, %c0, %c0, %arg1] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%36 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %13[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%37 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %7[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%38 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%39 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%40 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %38, %39, %40 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %41, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c3 step %c1 {
%36 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%37 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %23[%c0, %c0, %arg0, %36] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%38 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg2)
%39 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %27[%c0, %c0, %38, %arg1] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%40 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %15[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%41 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %9[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%42 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%43 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%44 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %43, %44 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %45, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%33 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %24[%c0, %c0, %arg0, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%34 = amdaie.dma_cpy_nd(%10[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %28[%c0, %c0, %c192, %arg1] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%36 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %17[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%37 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %11[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%39 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%40 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%41 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %40, %41 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %42, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
%38 = amdaie.dma_cpy_nd(%19[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %18[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%35 = amdaie.dma_cpy_nd(%30[%arg0, %arg1] [%c128, %c128] [%c128, %c1], %20[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
// -----// IR Dump Before AMDAIEInsertCores (iree-amdaie-insert-cores) //----- //
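// The dump below shows the IR before iree-amdaie-insert-cores. Loop bounds are now
// normalized (scf.for %arg2 = %c0_10 to %c2_9 step %c1, with affine.apply (d0 + 1) restoring
// the original induction value), and the outer scf.forall has been rewritten from explicit
// bounds to an iteration count of (1, 1) with affine.apply (d0 * 128) recovering the block
// offsets. The insert-cores pass presumably wraps each thread-level body in an amdaie.core
// region anchored to an amdaie.tile, as the next dump shows.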
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%7 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%8 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%9 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%10 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%11 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%12 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%13 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%19 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%20 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %21, 64 : memref<128x256xbf16>
%25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %25, 64 : memref<256x128xbf16>
%29 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %29, 64 : memref<128x128xbf16>
%c1_5 = arith.constant 1 : index
%c1_6 = arith.constant 1 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
scf.forall (%arg0, %arg1) in (1, 1) {
%31 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%32 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%33 = amdaie.dma_cpy_nd(%12[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %22[%c0, %c0, %32, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%34 = amdaie.dma_cpy_nd(%6[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %26[%c0, %c0, %c0, %31] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%38 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %13[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %7[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%40 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%41 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%42 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %41, %42 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %43, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c2_9 = arith.constant 2 : index
%c0_10 = arith.constant 0 : index
scf.for %arg2 = %c0_10 to %c2_9 step %c1 {
%38 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%39 = affine.apply affine_map<(d0) -> (d0 * 64)>(%38)
%40 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %23[%c0, %c0, %32, %39] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%41 = affine.apply affine_map<(d0) -> (d0 * 64)>(%38)
%42 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %27[%c0, %c0, %41, %31] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%43 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %15[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%44 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %9[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%45 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%46 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%47 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %47 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %48, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%35 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %24[%c0, %c0, %32, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%36 = amdaie.dma_cpy_nd(%10[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %28[%c0, %c0, %c192, %31] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%38 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %17[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %11[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%41 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%42 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%43 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %42, %43 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %44, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
%40 = amdaie.dma_cpy_nd(%19[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %18[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%37 = amdaie.dma_cpy_nd(%30[%32, %31] [%c128, %c128] [%c128, %c1], %20[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
// -----// IR Dump Before AMDAIELocalizeLogicalObjectfifo (iree-amdaie-localize-logicalobjectfifo) //----- //
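// The dump below shows the IR before iree-amdaie-localize-logicalobjectfifo. Each per-tile
// computation now sits inside an amdaie.core region tied to an amdaie.tile whose row is
// offset by 2 (arith.addi %arg2, %c2), with amdaie.logicalobjectfifo.consume/produce marking
// the DMA copies the core depends on. The localize pass presumably moves the
// amdaie.logicalobjectfifo.from_memref definitions from the function entry into the
// scf.forall regions where they are used, which is what the CSE dump that follows reflects.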
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%7 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%8 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%9 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%10 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%11 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%12 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%13 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%19 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%20 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %21, 64 : memref<128x256xbf16>
%25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %25, 64 : memref<256x128xbf16>
%29 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %29, 64 : memref<128x128xbf16>
%c1_5 = arith.constant 1 : index
%c1_6 = arith.constant 1 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
scf.forall (%arg0, %arg1) in (1, 1) {
%31 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%32 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%33 = amdaie.dma_cpy_nd(%12[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %22[%c0, %c0, %32, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%34 = amdaie.dma_cpy_nd(%6[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %26[%c0, %c0, %c0, %31] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%38 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %13[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %7[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%c2_11 = arith.constant 2 : index
%40 = arith.addi %arg2, %c2_11 : index
%tile = amdaie.tile(%arg3, %40)
%41 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%39)
linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%42 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%43 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%44 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %43, %44 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %45, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c2_9 = arith.constant 2 : index
%c0_10 = arith.constant 0 : index
scf.for %arg2 = %c0_10 to %c2_9 step %c1 {
%38 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%39 = affine.apply affine_map<(d0) -> (d0 * 64)>(%38)
%40 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %23[%c0, %c0, %32, %39] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%41 = affine.apply affine_map<(d0) -> (d0 * 64)>(%38)
%42 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %27[%c0, %c0, %41, %31] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%43 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %15[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%44 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %9[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%c2_11 = arith.constant 2 : index
%45 = arith.addi %arg3, %c2_11 : index
%tile = amdaie.tile(%arg4, %45)
%46 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%43)
amdaie.logicalobjectfifo.consume(%44)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%35 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %24[%c0, %c0, %32, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%36 = amdaie.dma_cpy_nd(%10[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %28[%c0, %c0, %c192, %31] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%38 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %17[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %11[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%40 = amdaie.dma_cpy_nd(%19[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %18[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>)
%c2_11 = arith.constant 2 : index
%41 = arith.addi %arg2, %c2_11 : index
%tile = amdaie.tile(%arg3, %41)
%42 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%39)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%43 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%44 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%45 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%46 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %43, %44, %45 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %46, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%40)
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%37 = amdaie.dma_cpy_nd(%30[%32, %31] [%c128, %c128] [%c128, %c1], %20[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
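// The dump below shows the IR before cse. After localization, several values are redundant:
// repeated amdaie.logicalobjectfifo.from_memref ops over the same alloc and duplicated index
// constants such as %c1_5 through %c1_8 are all candidates for common-subexpression
// elimination.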
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%9 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
%10 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %7, 64 : memref<128x256xbf16>
%11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%12 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%13 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
%14 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %11, 64 : memref<256x128xbf16>
%15 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%16 = amdaie.logicalobjectfifo.from_memref %15, {} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %15, 64 : memref<128x128xbf16>
%c1_5 = arith.constant 1 : index
%c1_6 = arith.constant 1 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
scf.forall (%arg0, %arg1) in (1, 1) {
%17 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%18 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%19 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %8[%c0, %c0, %18, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%20 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %12[%c0, %c0, %c0, %17] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%21 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%22 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%23 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%24 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%34 = amdaie.dma_cpy_nd(%22[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %24[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%35 = amdaie.dma_cpy_nd(%21[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %23[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%c2_11 = arith.constant 2 : index
%36 = arith.addi %arg2, %c2_11 : index
%tile = amdaie.tile(%arg3, %36)
%37 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%34)
amdaie.logicalobjectfifo.consume(%35)
linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%38 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%39 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%40 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %38, %39, %40 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %41, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c2_9 = arith.constant 2 : index
%c0_10 = arith.constant 0 : index
scf.for %arg2 = %c0_10 to %c2_9 step %c1 {
%34 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%35 = affine.apply affine_map<(d0) -> (d0 * 64)>(%34)
%36 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %9[%c0, %c0, %18, %35] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%37 = affine.apply affine_map<(d0) -> (d0 * 64)>(%34)
%38 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %13[%c0, %c0, %37, %17] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%39 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%40 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%41 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%42 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
scf.forall (%arg3, %arg4) in (2, 2) {
%43 = amdaie.dma_cpy_nd(%40[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %42[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%44 = amdaie.dma_cpy_nd(%39[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %41[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%c2_11 = arith.constant 2 : index
%45 = arith.addi %arg3, %c2_11 : index
%tile = amdaie.tile(%arg4, %45)
%46 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%43)
amdaie.logicalobjectfifo.consume(%44)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%25 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %10[%c0, %c0, %18, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%26 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %14[%c0, %c0, %c192, %17] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%27 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%28 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%29 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%30 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%31 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>
%32 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%34 = amdaie.dma_cpy_nd(%28[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %30[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%35 = amdaie.dma_cpy_nd(%27[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %29[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%36 = amdaie.dma_cpy_nd(%32[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %31[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>)
%c2_11 = arith.constant 2 : index
%37 = arith.addi %arg2, %c2_11 : index
%tile = amdaie.tile(%arg3, %37)
%38 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%34)
amdaie.logicalobjectfifo.consume(%35)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%39 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%40 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%41 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %40, %41 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %42, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%36)
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%33 = amdaie.dma_cpy_nd(%16[%18, %17] [%c128, %c128] [%c128, %c1], %6[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
// -----// IR Dump Before AMDAIEDistributeCoresAndObjectFifos (iree-amdaie-distribute-cores-and-objectfifos) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%4 = amdaie.logicalobjectfifo.from_memref %3, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %3, 64 : memref<128x256xbf16>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%6 = amdaie.logicalobjectfifo.from_memref %5, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %5, 64 : memref<256x128xbf16>
%7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %7, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) in (1, 1) {
%9 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%10 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%11 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%12 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c0, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%13 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%19 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%20 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 16, 16, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x16x16x4x4xbf16, 2 : i32> to memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>
%21 = arith.addi %arg2, %c2 : index
%tile = amdaie.tile(%arg3, %21)
%22 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%20)
linalg.fill ins(%cst : bf16) outs(%subview : memref<1x1x16x16x4x4xbf16, strided<[8192, 4096, 256, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%23 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%24 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%25 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%26 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %23, %24, %25 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %26, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c0 to %c2 step %c1 {
%19 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 64)>(%19)
%21 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %20] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%22 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %20, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%23 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%24 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%25 = arith.addi %arg3, %c2 : index
%tile = amdaie.tile(%arg4, %25)
%26 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%23)
amdaie.logicalobjectfifo.consume(%24)
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c16 step %c1 {
scf.for %arg7 = %c0 to %c8 step %c1 {
%27 = vector.transfer_read %alloc_0[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%28 = vector.transfer_read %alloc[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%29 = vector.transfer_read %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%30 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %28, %29 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %30, %alloc_3[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%15 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%16 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c192, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%17 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%19 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%20 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%21 = amdaie.dma_cpy_nd(%2[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %17[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x16x16x4x4xbf16, 2 : i32>>)
%22 = arith.addi %arg2, %c2 : index
%tile = amdaie.tile(%arg3, %22)
%23 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%20)
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c16 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
%24 = vector.transfer_read %alloc_0[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%25 = vector.transfer_read %alloc[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%26 = vector.transfer_read %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %24, %25, %26 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %27, %alloc_3[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<2x2x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%21)
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%18 = amdaie.dma_cpy_nd(%8[%10, %9] [%c128, %c128] [%c128, %c1], %2[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%tile = amdaie.tile(%c0, %c1)
%tile_3 = amdaie.tile(%c1, %c1)
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_4 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_5 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%tile_6 = amdaie.tile(%c0, %c1)
%tile_7 = amdaie.tile(%c1, %c1)
%1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_6} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_8 = memref.alloc() : memref<1x1x16x16x4x4xbf16, 2 : i32>
%alloc_9 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%alloc_10 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%tile_11 = amdaie.tile(%c0, %c1)
%tile_12 = amdaie.tile(%c1, %c1)
%2 = amdaie.logicalobjectfifo.from_memref %alloc_9, {%tile_11} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%tile_13 = amdaie.tile(%c0, %c0)
%tile_14 = amdaie.tile(%c1, %c0)
%4 = amdaie.logicalobjectfifo.from_memref %3, {%tile_13} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %3, 64 : memref<128x256xbf16>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%tile_15 = amdaie.tile(%c0, %c0)
%tile_16 = amdaie.tile(%c1, %c0)
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_15} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %5, 64 : memref<256x128xbf16>
%7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%tile_17 = amdaie.tile(%c0, %c0)
%tile_18 = amdaie.tile(%c1, %c0)
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_17} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %7, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) in (1, 1) {
%9 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%10 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%11 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%12 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c0, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%tile_19 = amdaie.tile(%c1, %c3)
%tile_20 = amdaie.tile(%c1, %c2)
%13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_20, %tile_19} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_21 = amdaie.tile(%c1, %c3)
%tile_22 = amdaie.tile(%c1, %c2)
%14 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_22, %tile_21} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_23 = amdaie.tile(%c1, %c3)
%tile_24 = amdaie.tile(%c1, %c2)
%15 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_24, %tile_23} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_25 = amdaie.tile(%c0, %c3)
%tile_26 = amdaie.tile(%c0, %c2)
%16 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_26, %tile_25} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_27 = amdaie.tile(%c1, %c3)
%tile_28 = amdaie.tile(%c0, %c3)
%17 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_28, %tile_27} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%tile_29 = amdaie.tile(%c1, %c3)
%tile_30 = amdaie.tile(%c0, %c3)
%18 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_30, %tile_29} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%tile_31 = amdaie.tile(%c1, %c3)
%tile_32 = amdaie.tile(%c0, %c3)
%19 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_32, %tile_31} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%tile_33 = amdaie.tile(%c1, %c2)
%tile_34 = amdaie.tile(%c0, %c2)
%20 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_34, %tile_33} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%21 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%22 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%23 = amdaie.dma_cpy_nd(%20[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%tile_35 = amdaie.tile(%c0, %c2)
%tile_36 = amdaie.tile(%c0, %c2)
%24 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_36} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_37 = amdaie.tile(%c0, %c2)
%25 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_37} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%26 = amdaie.core(%tile_35) {
%56 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%59 = amdaie.logicalobjectfifo.access(%24, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%23)
amdaie.logicalobjectfifo.consume(%21)
linalg.fill ins(%cst : bf16) outs(%59 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%60 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%61 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%62 = vector.transfer_read %56[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %63, %56[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%tile_38 = amdaie.tile(%c1, %c2)
%tile_39 = amdaie.tile(%c1, %c2)
%27 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_39} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_40 = amdaie.tile(%c1, %c2)
%28 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_40} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%29 = amdaie.core(%tile_38) {
%56 = amdaie.logicalobjectfifo.access(%28, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%59 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%23)
amdaie.logicalobjectfifo.consume(%22)
linalg.fill ins(%cst : bf16) outs(%59 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%60 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%61 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%62 = vector.transfer_read %56[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %63, %56[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%30 = amdaie.dma_cpy_nd(%19[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%tile_41 = amdaie.tile(%c0, %c3)
%tile_42 = amdaie.tile(%c0, %c3)
%31 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_42} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_43 = amdaie.tile(%c0, %c3)
%32 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_43} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%33 = amdaie.core(%tile_41) {
%56 = amdaie.logicalobjectfifo.access(%32, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%19, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%59 = amdaie.logicalobjectfifo.access(%31, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%30)
amdaie.logicalobjectfifo.consume(%21)
linalg.fill ins(%cst : bf16) outs(%59 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%60 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%61 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%62 = vector.transfer_read %56[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %63, %56[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%tile_44 = amdaie.tile(%c1, %c3)
%tile_45 = amdaie.tile(%c1, %c3)
%34 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_45} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_46 = amdaie.tile(%c1, %c3)
%35 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_46} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%36 = amdaie.core(%tile_44) {
%56 = amdaie.logicalobjectfifo.access(%35, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%19, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%59 = amdaie.logicalobjectfifo.access(%34, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%30)
amdaie.logicalobjectfifo.consume(%22)
linalg.fill ins(%cst : bf16) outs(%59 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%60 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%61 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%62 = vector.transfer_read %56[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %62 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %63, %56[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
scf.for %arg2 = %c0 to %c2 step %c1 {
%56 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%57 = affine.apply affine_map<(d0) -> (d0 * 64)>(%56)
%58 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %57] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%59 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %57, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%60 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%61 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%62 = amdaie.dma_cpy_nd(%20[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%tile_55 = amdaie.tile(%c0, %c2)
%tile_56 = amdaie.tile(%c0, %c2)
%63 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_56} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%64 = amdaie.core(%tile_55) {
%72 = amdaie.logicalobjectfifo.access(%63, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%73 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%74 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%62)
amdaie.logicalobjectfifo.consume(%60)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%75 = vector.transfer_read %74[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%76 = vector.transfer_read %73[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%77 = vector.transfer_read %72[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %75, %76, %77 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %78, %72[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%tile_57 = amdaie.tile(%c1, %c2)
%tile_58 = amdaie.tile(%c1, %c2)
%65 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_58} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%66 = amdaie.core(%tile_57) {
%72 = amdaie.logicalobjectfifo.access(%65, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%73 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%74 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%62)
amdaie.logicalobjectfifo.consume(%61)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%75 = vector.transfer_read %74[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%76 = vector.transfer_read %73[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%77 = vector.transfer_read %72[%c0, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %75, %76, %77 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %78, %72[%c0, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%67 = amdaie.dma_cpy_nd(%18[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%tile_59 = amdaie.tile(%c0, %c3)
%tile_60 = amdaie.tile(%c0, %c3)
%68 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_60} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%69 = amdaie.core(%tile_59) {
%72 = amdaie.logicalobjectfifo.access(%68, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%73 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%74 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%67)
amdaie.logicalobjectfifo.consume(%60)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%75 = vector.transfer_read %74[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%76 = vector.transfer_read %73[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%77 = vector.transfer_read %72[%c1, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %75, %76, %77 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %78, %72[%c1, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%tile_61 = amdaie.tile(%c1, %c3)
%tile_62 = amdaie.tile(%c1, %c3)
%70 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_62} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%71 = amdaie.core(%tile_61) {
%72 = amdaie.logicalobjectfifo.access(%70, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%73 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%74 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%67)
amdaie.logicalobjectfifo.consume(%61)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%75 = vector.transfer_read %74[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%76 = vector.transfer_read %73[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%77 = vector.transfer_read %72[%c1, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %75, %76, %77 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %78, %72[%c1, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
}
%37 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%38 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c192, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%tile_47 = amdaie.tile(%c1, %c3)
%39 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_47} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_48 = amdaie.tile(%c1, %c2)
%40 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_48} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_49 = amdaie.tile(%c0, %c3)
%41 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_49} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%tile_50 = amdaie.tile(%c0, %c2)
%42 = amdaie.logicalobjectfifo.from_memref %alloc_8, {%tile_50} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%43 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%44 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%45 = amdaie.dma_cpy_nd(%20[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%46 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %42[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%tile_51 = amdaie.tile(%c0, %c2)
%47 = amdaie.core(%tile_51) {
%56 = amdaie.logicalobjectfifo.access(%42, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%45)
amdaie.logicalobjectfifo.consume(%43)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%46)
amdaie.end
}
%48 = amdaie.dma_cpy_nd(%2[%c0, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %40[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%tile_52 = amdaie.tile(%c1, %c2)
%49 = amdaie.core(%tile_52) {
%56 = amdaie.logicalobjectfifo.access(%40, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%45)
amdaie.logicalobjectfifo.consume(%44)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%48)
amdaie.end
}
%50 = amdaie.dma_cpy_nd(%17[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%51 = amdaie.dma_cpy_nd(%2[%c1, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %41[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%tile_53 = amdaie.tile(%c0, %c3)
%52 = amdaie.core(%tile_53) {
%56 = amdaie.logicalobjectfifo.access(%41, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%50)
amdaie.logicalobjectfifo.consume(%43)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%51)
amdaie.end
}
%53 = amdaie.dma_cpy_nd(%2[%c1, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %39[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%tile_54 = amdaie.tile(%c1, %c3)
%54 = amdaie.core(%tile_54) {
%56 = amdaie.logicalobjectfifo.access(%39, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%50)
amdaie.logicalobjectfifo.consume(%44)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%53)
amdaie.end
}
%55 = amdaie.dma_cpy_nd(%8[%10, %9] [%c128, %c128] [%c128, %c1], %2[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_10 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_5 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
memref.dealloc %alloc_8 : memref<1x1x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_4 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_9 : memref<2x2x64x64xbf16, 1 : i32>
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%alloc_2 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%tile = amdaie.tile(%c0, %c1)
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%alloc_4 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_5 = memref.alloc() : memref<1x1x16x16x4x4xbf16, 2 : i32>
%alloc_6 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%alloc_7 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%tile_8 = amdaie.tile(%c0, %c0)
%4 = amdaie.logicalobjectfifo.from_memref %3, {%tile_8} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %3, 64 : memref<128x256xbf16>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_8} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %5, 64 : memref<256x128xbf16>
%7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_8} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %7, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) in (1, 1) {
%9 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%10 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%11 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%12 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c0, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%tile_9 = amdaie.tile(%c1, %c3)
%tile_10 = amdaie.tile(%c1, %c2)
%13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_10, %tile_9} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_11 = amdaie.tile(%c0, %c3)
%tile_12 = amdaie.tile(%c0, %c2)
%14 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_12, %tile_11} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_11, %tile_9} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_12, %tile_10} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%17 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%18 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%19 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%20 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_12} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%21 = amdaie.core(%tile_12) {
%44 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%17)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%22 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_10} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%23 = amdaie.core(%tile_10) {
%44 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%18)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%24 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%25 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_11} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%26 = amdaie.core(%tile_11) {
%44 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%24)
amdaie.logicalobjectfifo.consume(%17)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%27 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_9} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%28 = amdaie.core(%tile_9) {
%44 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%24)
amdaie.logicalobjectfifo.consume(%18)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
scf.for %arg2 = %c0 to %c2 step %c1 {
%44 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%45 = affine.apply affine_map<(d0) -> (d0 * 64)>(%44)
%46 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %45] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%47 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %45, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%48 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%49 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%50 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%51 = amdaie.core(%tile_12) {
%56 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%50)
amdaie.logicalobjectfifo.consume(%48)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%52 = amdaie.core(%tile_10) {
%56 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%50)
amdaie.logicalobjectfifo.consume(%49)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c0, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c0, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%53 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%54 = amdaie.core(%tile_11) {
%56 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%53)
amdaie.logicalobjectfifo.consume(%48)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c1, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c1, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%55 = amdaie.core(%tile_9) {
%56 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%58 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%53)
amdaie.logicalobjectfifo.consume(%49)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%59 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%60 = vector.transfer_read %57[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%61 = vector.transfer_read %56[%c1, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %62, %56[%c1, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
}
%29 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%30 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c192, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%31 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%32 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%33 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%34 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %20[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%35 = amdaie.core(%tile_12) {
%44 = amdaie.logicalobjectfifo.access(%20, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%33)
amdaie.logicalobjectfifo.consume(%31)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%34)
amdaie.end
}
%36 = amdaie.dma_cpy_nd(%2[%c0, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %22[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%37 = amdaie.core(%tile_10) {
%44 = amdaie.logicalobjectfifo.access(%22, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%33)
amdaie.logicalobjectfifo.consume(%32)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%36)
amdaie.end
}
%38 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%2[%c1, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %25[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%40 = amdaie.core(%tile_11) {
%44 = amdaie.logicalobjectfifo.access(%25, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%31)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%39)
amdaie.end
}
%41 = amdaie.dma_cpy_nd(%2[%c1, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %27[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%42 = amdaie.core(%tile_9) {
%44 = amdaie.logicalobjectfifo.access(%27, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%32)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%41)
amdaie.end
}
%43 = amdaie.dma_cpy_nd(%8[%10, %9] [%c128, %c128] [%c128, %c1], %2[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_7 : memref<2x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_4 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
memref.dealloc %alloc_5 : memref<1x1x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_3 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_6 : memref<2x2x64x64xbf16, 1 : i32>
return
}
}
// -----// IR Dump Before AMDAIEDmaToCircularDma (iree-amdaie-dma-to-circular-dma) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%tile = amdaie.tile(%c0, %c1)
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<1x1x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%tile_5 = amdaie.tile(%c0, %c0)
%4 = amdaie.logicalobjectfifo.from_memref %3, {%tile_5} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %3, 64 : memref<128x256xbf16>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_5} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %5, 64 : memref<256x128xbf16>
%7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_5} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %7, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) in (1, 1) {
%9 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%10 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%11 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%12 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c0, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%tile_6 = amdaie.tile(%c1, %c3)
%tile_7 = amdaie.tile(%c1, %c2)
%13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_7, %tile_6} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_8 = amdaie.tile(%c0, %c3)
%tile_9 = amdaie.tile(%c0, %c2)
%14 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_9, %tile_8} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_8, %tile_6} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_9, %tile_7} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%17 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%18 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%19 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%20 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_9} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%21 = amdaie.core(%tile_9) {
%44 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%17)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%22 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_7} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%23 = amdaie.core(%tile_7) {
%44 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%18)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%24 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%25 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_8} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%26 = amdaie.core(%tile_8) {
%44 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%24)
amdaie.logicalobjectfifo.consume(%17)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%27 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%28 = amdaie.core(%tile_6) {
%44 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%24)
amdaie.logicalobjectfifo.consume(%18)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
scf.for %arg2 = %c0 to %c2 step %c1 {
%44 = affine.apply affine_map<(d0) -> (d0 * 64 + 64)>(%arg2)
%45 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %44] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%46 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %44, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%47 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%48 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%49 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%50 = amdaie.core(%tile_9) {
%55 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%49)
amdaie.logicalobjectfifo.consume(%47)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%51 = amdaie.core(%tile_7) {
%55 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%49)
amdaie.logicalobjectfifo.consume(%48)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c0, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c0, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%52 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%53 = amdaie.core(%tile_8) {
%55 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%52)
amdaie.logicalobjectfifo.consume(%47)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c1, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c1, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%54 = amdaie.core(%tile_6) {
%55 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%52)
amdaie.logicalobjectfifo.consume(%48)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c1, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c1, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
}
%29 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%30 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c192, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%31 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%32 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%33 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%34 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %20[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%35 = amdaie.core(%tile_9) {
%44 = amdaie.logicalobjectfifo.access(%20, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%33)
amdaie.logicalobjectfifo.consume(%31)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%34)
amdaie.end
}
%36 = amdaie.dma_cpy_nd(%2[%c0, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %22[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%37 = amdaie.core(%tile_7) {
%44 = amdaie.logicalobjectfifo.access(%22, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%33)
amdaie.logicalobjectfifo.consume(%32)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%36)
amdaie.end
}
%38 = amdaie.dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%2[%c1, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %25[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%40 = amdaie.core(%tile_8) {
%44 = amdaie.logicalobjectfifo.access(%25, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%31)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%39)
amdaie.end
}
%41 = amdaie.dma_cpy_nd(%2[%c1, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %27[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%42 = amdaie.core(%tile_6) {
%44 = amdaie.logicalobjectfifo.access(%27, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%32)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%41)
amdaie.end
}
%43 = amdaie.dma_cpy_nd(%8[%10, %9] [%c128, %c128] [%c128, %c1], %2[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
memref.dealloc %alloc_3 : memref<1x1x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
return
}
}
// -----// IR Dump Before AMDAIECreateAIEWorkgroup (iree-amdaie-create-aie-workgroup) //----- //
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c3 = arith.constant 3 : index
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%tile = amdaie.tile(%c0, %c1)
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<1x1x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%tile_5 = amdaie.tile(%c0, %c0)
%4 = amdaie.logicalobjectfifo.from_memref %3, {%tile_5} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>
memref.assume_alignment %3, 64 : memref<128x256xbf16>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xbf16>
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_5} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>>
memref.assume_alignment %5, 64 : memref<256x128xbf16>
%7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xbf16>
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_5} : memref<128x128xbf16> -> !amdaie.logicalobjectfifo<memref<128x128xbf16>>
memref.assume_alignment %7, 64 : memref<128x128xbf16>
scf.forall (%arg0, %arg1) in (1, 1) {
%9 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%10 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg0)
%11 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c0] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%12 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c0, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%tile_6 = amdaie.tile(%c1, %c3)
%tile_7 = amdaie.tile(%c1, %c2)
%13 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_7, %tile_6} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%tile_8 = amdaie.tile(%c0, %c3)
%tile_9 = amdaie.tile(%c0, %c2)
%14 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_9, %tile_8} : memref<1x1x16x8x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_8, %tile_6} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_9, %tile_7} : memref<1x1x8x16x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>
%17 = amdaie.circular_dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%18 = amdaie.circular_dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%19 = amdaie.circular_dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%20 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_9} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%21 = amdaie.core(%tile_9) {
%44 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%17)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%22 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_7} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%23 = amdaie.core(%tile_7) {
%44 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%18)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%24 = amdaie.circular_dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%25 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_8} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%26 = amdaie.core(%tile_8) {
%44 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%24)
amdaie.logicalobjectfifo.consume(%17)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%27 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x16x16x4x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>
%28 = amdaie.core(%tile_6) {
%44 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
%47 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%24)
amdaie.logicalobjectfifo.consume(%18)
linalg.fill ins(%cst : bf16) outs(%47 : memref<1x1x16x16x4x4xbf16, 2 : i32>)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%48 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%49 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%50 = vector.transfer_read %44[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %50 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %51, %44[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
scf.for %arg2 = %c0 to %c2 step %c1 {
%44 = affine.apply affine_map<(d0) -> (d0 * 64 + 64)>(%arg2)
%45 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %44] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%46 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %44, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%47 = amdaie.circular_dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%48 = amdaie.circular_dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%49 = amdaie.circular_dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%50 = amdaie.core(%tile_9) {
%55 = amdaie.logicalobjectfifo.access(%20, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%49)
amdaie.logicalobjectfifo.consume(%47)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%51 = amdaie.core(%tile_7) {
%55 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%49)
amdaie.logicalobjectfifo.consume(%48)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c0, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c0, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%52 = amdaie.circular_dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%53 = amdaie.core(%tile_8) {
%55 = amdaie.logicalobjectfifo.access(%25, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%52)
amdaie.logicalobjectfifo.consume(%47)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c1, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c1, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
%54 = amdaie.core(%tile_6) {
%55 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%56 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%57 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%52)
amdaie.logicalobjectfifo.consume(%48)
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c16 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
%58 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%59 = vector.transfer_read %56[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%60 = vector.transfer_read %55[%c1, %c1, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %59, %60 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %61, %55[%c1, %c1, %arg4, %arg3, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.end
}
}
%29 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1], %4[%c0, %c0, %10, %c192] [%c2, %c1, %c64, %c64] [%c16384, %c64, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>)
%30 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c64, %c64] [%c8192, %c4096, %c64, %c1], %6[%c0, %c0, %c192, %9] [%c1, %c2, %c64, %c64] [%c8192, %c64, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>)
%31 = amdaie.circular_dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%32 = amdaie.circular_dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c4096, %c4096, %c256, %c32, %c4, %c1], %0[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c8, %c8, %c4] [%c8192, %c4096, %c4, %c512, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>)
%33 = amdaie.circular_dma_cpy_nd(%16[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%34 = amdaie.circular_dma_cpy_nd(%2[%c0, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %20[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%35 = amdaie.core(%tile_9) {
%44 = amdaie.logicalobjectfifo.access(%20, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%33)
amdaie.logicalobjectfifo.consume(%31)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%34)
amdaie.end
}
%36 = amdaie.circular_dma_cpy_nd(%2[%c0, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %22[%c0, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%37 = amdaie.core(%tile_7) {
%44 = amdaie.logicalobjectfifo.access(%22, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%33)
amdaie.logicalobjectfifo.consume(%32)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c0, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c0, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [true, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%36)
amdaie.end
}
%38 = amdaie.circular_dma_cpy_nd(%15[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c512, %c32, %c8, %c1], %1[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16, %c4, %c8] [%c4096, %c4096, %c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>)
%39 = amdaie.circular_dma_cpy_nd(%2[%c1, %c0, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %25[%c1, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%40 = amdaie.core(%tile_8) {
%44 = amdaie.logicalobjectfifo.access(%25, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%14, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%31)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c1, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, true, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c1, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [false, true, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%39)
amdaie.end
}
%41 = amdaie.circular_dma_cpy_nd(%2[%c1, %c1, %c0, %c0] [%c1, %c1, %c64, %c64] [%c8192, %c4096, %c64, %c1], %27[%c1, %c1, %c0, %c0, %c0, %c0] [%c1, %c1, %c16, %c4, %c16, %c4] [%c8192, %c4096, %c16, %c4, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>>)
%42 = amdaie.core(%tile_6) {
%44 = amdaie.logicalobjectfifo.access(%27, Write) : !amdaie.logicalobjectfifo<memref<1x1x16x16x4x4xbf16, 2 : i32>> -> memref<1x1x16x16x4x4xbf16, 2 : i32>
%45 = amdaie.logicalobjectfifo.access(%13, Read) : !amdaie.logicalobjectfifo<memref<1x1x16x8x8x4xbf16, 2 : i32>> -> memref<1x1x16x8x8x4xbf16, 2 : i32>
%46 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x16x4x8xbf16, 2 : i32>> -> memref<1x1x8x16x4x8xbf16, 2 : i32>
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%32)
scf.for %arg2 = %c0 to %c16 step %c1 {
scf.for %arg3 = %c0 to %c16 step %c1 {
scf.for %arg4 = %c0 to %c8 step %c1 {
%47 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x16x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16>
%48 = vector.transfer_read %45[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x16x8x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16>
%49 = vector.transfer_read %44[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [false, false, false, false, true, true]} : memref<1x1x16x16x4x4xbf16, 2 : i32>, vector<1x1x1x1x4x4xbf16>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %48, %49 : vector<1x1x1x1x4x8xbf16>, vector<1x1x1x1x8x4xbf16> into vector<1x1x1x1x4x4xbf16>
vector.transfer_write %50, %44[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xbf16>, memref<1x1x16x16x4x4xbf16, 2 : i32>
}
}
}
amdaie.logicalobjectfifo.produce(%41)
amdaie.end
}
%43 = amdaie.dma_cpy_nd(%8[%10, %9] [%c128, %c128] [%c128, %c1], %2[%c0, %c0, %c0, %c0] [%c2, %c64, %c2, %c64] [%c8192, %c64, %c4096, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xbf16>>, !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_0 : memref<1x1x8x16x4x8xbf16, 2 : i32>
memref.dealloc %alloc : memref<1x1x16x8x8x4xbf16, 2 : i32>
memref.dealloc %alloc_3 : memref<1x1x16x16x4x4xbf16, 2 : i32>
memref.dealloc %alloc_1 : memref<1x2x64x64xbf16, 1 : i32>
memref.dealloc %alloc_2 : memref<2x1x64x64xbf16, 1 : i32>
memref.dealloc %alloc_4 : memref<2x2x64x64xbf16, 1 : i32>
return
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
amdaie.workgroup {
%c3 = arith.constant 3 : index
%c192 = arith.constant 192 : index
%c32 = arith.constant 32 : index
%c512 = arith.constant 512 : index
%c4 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
%c256 = arith.constant 256 : index
%c16384 = arith.constant 16384 : index
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%c2 = arith.constant 2 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : bf16
%alloc = memref.alloc() : memref<1x1x16x8x8x4xbf16, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x8x16x4x8xbf16, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x64x64xbf16, 1 : i32>
%tile = amdaie.tile(%c0, %c1)
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<1x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x64x64xbf16, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x64x64xbf16, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<2x1x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x64x64xbf16, 1 : i32>>
%alloc_3 = memref.alloc() : memref<1x1x16x16x4x4xbf16, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x64x64xbf16, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<2x2x64x64xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x64x64xbf16, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xbf16>
%tile_5 = amdaie.tile(%c0, %c0)
%4 = amdaie.logicalobjectfifo.from_memref %3, {%tile_5} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>>