Skip to content

Instantly share code, notes, and snippets.

@yzhang93
Created June 19, 2024 04:50
Show Gist options
  • Save yzhang93/48b5a4fb60dda465158f9dabbab14d91 to your computer and use it in GitHub Desktop.
Save yzhang93/48b5a4fb60dda465158f9dabbab14d91 to your computer and use it in GitHub Desktop.
Avoid folding two subviews
This file has been truncated, but you can view the full file.
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
}
// -----// IR Dump After AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialect (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
}
// -----// IR Dump After AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xi32>, tensor<256x64xi32>) outs(%9 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xi32>, tensor<256x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xi32>, tensor<256x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xi32>, tensor<256x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%9 = tensor.empty() : tensor<8x2x32x32xi32>
%10 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%11 = tensor.empty() : tensor<2x2x32x32xi32>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<64x64xi32> -> tensor<2x2x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xi32>, tensor<8x2x32x32xi32>) outs(%pack_3 : tensor<2x2x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_4: i32, %out: i32):
%13 = arith.muli %in, %in_4 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<2x2x32x32xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEPropagateDataLayout (iree-amdaie-propagate-data-layout) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%9 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%10 = tensor.empty() : tensor<2x2x32x32xi32>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x64xi32> -> tensor<2x2x32x32xi32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xi32>, tensor<8x2x32x32xi32>) outs(%pack_3 : tensor<2x2x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_4: i32, %out: i32):
%12 = arith.muli %in, %in_4 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
} -> tensor<2x2x32x32xi32>
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%9 = tensor.empty() : tensor<2x2x32x32xi32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<2x2x32x32xi32>) -> tensor<2x2x32x32xi32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xi32>, tensor<8x2x32x32xi32>) outs(%10 : tensor<2x2x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_3: i32, %out: i32):
%12 = arith.muli %in, %in_3 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
} -> tensor<2x2x32x32xi32>
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%9 = tensor.empty() : tensor<2x2x32x32xi32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<2x2x32x32xi32>) -> tensor<2x2x32x32xi32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xi32>, tensor<8x2x32x32xi32>) outs(%10 : tensor<2x2x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_3: i32, %out: i32):
%12 = arith.muli %in, %in_3 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
} -> tensor<2x2x32x32xi32>
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%9 = tensor.empty() : tensor<2x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<2x2x32x32xi32>) -> tensor<2x2x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xi32>, tensor<8x2x32x32xi32>) outs(%11 : tensor<2x2x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_3: i32, %out: i32):
%13 = arith.muli %in, %in_3 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<2x2x32x32xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%9 = tensor.empty() : tensor<2x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<2x2x32x32xi32>) -> tensor<2x2x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xi32>, tensor<8x2x32x32xi32>) outs(%11 : tensor<2x2x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_3: i32, %out: i32):
%13 = arith.muli %in, %in_3 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<2x2x32x32xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%9 = tensor.empty() : tensor<2x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<2x2x32x32xi32>) -> tensor<2x2x32x32xi32>
%12 = tensor.empty() : tensor<2x8x8x4x4x8xi32>
%13 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%14 = tensor.empty() : tensor<8x2x4x8x4x8xi32>
%15 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%16 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%17 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%pack_5 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<2x2x32x32xi32> -> tensor<2x2x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xi32>, tensor<8x2x8x4x8x4xi32>) outs(%pack_5 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_7: i32, %out: i32):
%19 = arith.muli %in, %in_7 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<2x2x8x8x4x4xi32>
%unpack = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEPropagateDataLayout (iree-amdaie-propagate-data-layout) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<2x2x32x32xi32>) -> tensor<2x2x32x32xi32>
%11 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%12 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%13 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%pack_5 = tensor.pack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %13 : tensor<2x2x32x32xi32> -> tensor<2x2x8x8x4x4xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xi32>, tensor<8x2x8x4x8x4xi32>) outs(%pack_5 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_7: i32, %out: i32):
%15 = arith.muli %in, %in_7 : i32
%16 = arith.addi %out, %15 : i32
linalg.yield %16 : i32
} -> tensor<2x2x8x8x4x4xi32>
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %10 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%12 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xi32>, tensor<8x2x8x4x8x4xi32>) outs(%13 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%15 = arith.muli %in, %in_6 : i32
%16 = arith.addi %out, %15 : i32
linalg.yield %16 : i32
} -> tensor<2x2x8x8x4x4xi32>
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_5 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_5 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%12 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xi32>, tensor<8x2x8x4x8x4xi32>) outs(%13 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%15 = arith.muli %in, %in_6 : i32
%16 = arith.addi %out, %15 : i32
linalg.yield %16 : i32
} -> tensor<2x2x8x8x4x4xi32>
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_5 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_5 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%12 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xi32>, tensor<8x2x8x4x8x4xi32>) outs(%14 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_7: i32, %out: i32):
%16 = arith.muli %in, %in_7 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<2x2x8x8x4x4xi32>
%unpack = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%12 = tensor.empty() : tensor<2x2x8x8x4x4xi32>
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xi32>, tensor<8x2x8x4x8x4xi32>) outs(%14 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%17 = arith.muli %in, %in_8 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<2x2x8x8x4x4xi32>
%c0_6 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%16 = scf.for %arg3 = %c0_6 to %c8 step %c1 iter_args(%arg4 = %14) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_8 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
%extracted_slice_9 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
%extracted_slice_10 = tensor.extract_slice %arg4[0, 0, 0, 0, 0, 0] [2, 2, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<2x2x8x8x4x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_8, %extracted_slice_9 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%extracted_slice_10 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_11: i32, %out: i32):
%18 = arith.muli %in, %in_11 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<2x2x8x8x4x4xi32>
%inserted_slice = tensor.insert_slice %17 into %arg4[0, 0, 0, 0, 0, 0] [2, 2, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%16 = arith.muli %in, %in_9 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<2x2x8x8x4x4xi32>
scf.yield %15 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%16 = arith.muli %in, %in_9 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<2x2x8x8x4x4xi32>
scf.yield %15 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%16 = arith.muli %in, %in_9 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<2x2x8x8x4x4xi32>
scf.yield %15 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = tensor.empty() : tensor<2x8x32x32xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xi32> -> tensor<2x8x32x32xi32>
%8 = tensor.empty() : tensor<8x2x32x32xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xi32> -> tensor<8x2x32x32xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xi32> -> tensor<2x8x4x8x4x8xi32>
%11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xi32> -> tensor<8x2x8x4x8x4xi32>
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
%15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%extracted_slice_8 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xi32> to tensor<2x1x32x32xi32>
%pack_9 = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_8 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_10 = tensor.extract_slice %pack[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xi32> to tensor<2x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
%pack_12 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice_0[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%extracted_slice_15 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xi32> to tensor<1x2x32x32xi32>
%pack_16 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%extracted_slice_17 = tensor.extract_slice %pack_2[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xi32> to tensor<1x2x32x32xi32>
%extracted_slice_18 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
%pack_19 = tensor.pack %pack_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_18 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
%extracted_slice_20 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_19 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_21: i32, %out: i32):
%18 = arith.muli %in, %in_21 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<2x2x8x8x4x4xi32>
scf.yield %17 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Reviewer annotations only; the IR text below is unchanged from the dump.
// 128x128x256 i32 matmul after canonicalization. The K-tile offset is still
// computed twice (%15 at the A-side and %16 at the B-side apply the same
// affine map d0 * 32 to the same induction variable); the CSE pass that
// follows merges them into one value.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: a 2x2 grid of 64x64 C tiles, mapped to #gpu.block y/x.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = tensor.empty() : tensor<2x8x32x32xi32>
    %8 = tensor.empty() : tensor<8x2x32x32xi32>
    // Staging buffer for the 2x2x32x32 C tile in memory space 1
    // (NOTE(review): presumably AIE memtile/L2 -- confirm against the AMDAIE backend).
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
    // Accumulator in the 4x4-tiled layout, memory space 2
    // (NOTE(review): presumably core-local/L1 -- confirm).
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    %13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    // Reduction loop: 8 steps of K-tile 32 over K = 256.
    %14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
      %15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xi32> to tensor<2x1x32x32xi32>
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_6 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
      %pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      // Duplicate of %15 (same map, same operand) -- folded away by CSE in the next dump.
      %16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %extracted_slice_9 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xi32> to tensor<1x2x32x32xi32>
      %pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_9 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %extracted_slice_11 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
      %pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Packed matmul-as-generic: i32 multiply-accumulate over the packed layouts.
      %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_7, %pack_12 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_13: i32, %out: i32):
        %18 = arith.muli %in, %in_13 : i32
        %19 = arith.addi %out, %18 : i32
        linalg.yield %19 : i32
      } -> tensor<2x2x8x8x4x4xi32>
      scf.yield %17 : tensor<2x2x8x8x4x4xi32>
    }
    // Two-stage unpack: 2x2x8x8x4x4 compute layout -> 2x2x32x32 tiles -> 64x64 output slice.
    %unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After CSE (cse) //----- //
// Reviewer annotations only; the IR text below is unchanged from the dump.
// Same function after CSE: the duplicated affine.apply of the previous dump
// has been merged -- %15 now feeds both the A-side slice (%extracted_slice_4)
// and the B-side slice (%extracted_slice_8) -- and the SSA values after it
// are renumbered accordingly.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: a 2x2 grid of 64x64 C tiles, mapped to #gpu.block y/x.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = tensor.empty() : tensor<2x8x32x32xi32>
    %8 = tensor.empty() : tensor<8x2x32x32xi32>
    // Staging buffer for the C tile, memory space 1
    // (NOTE(review): presumably AIE memtile/L2 -- confirm).
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
    // Accumulator in the 4x4-tiled layout, memory space 2
    // (NOTE(review): presumably core-local/L1 -- confirm).
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    %13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    // Reduction loop: 8 steps of K-tile 32 over K = 256.
    %14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
      // Single shared K offset after CSE.
      %15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xi32> to tensor<2x1x32x32xi32>
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_6 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
      %pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%15, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %extracted_slice_9 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xi32> to tensor<1x2x32x32xi32>
      %pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_9 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %extracted_slice_11 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
      %pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Packed matmul-as-generic: i32 multiply-accumulate over the packed layouts.
      %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_7, %pack_12 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_13: i32, %out: i32):
        %17 = arith.muli %in, %in_13 : i32
        %18 = arith.addi %out, %17 : i32
        linalg.yield %18 : i32
      } -> tensor<2x2x8x8x4x4xi32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Two-stage unpack: compute layout -> 2x2x32x32 tiles -> 64x64 output slice.
    %unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
// Reviewer annotations only; the IR text below is unchanged from the dump.
// Same function after iree-amdaie-bufferize-to-allocation: the first-level
// pack destinations for both matmul inputs are now materialized as dedicated
// memory-space-1 allocations inside the K loop (%alloc_6 for A, %alloc_11
// for B), each paired with a dealloc at the end of the loop body. The
// second-level packs still target tensor.empty slices.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: a 2x2 grid of 64x64 C tiles, mapped to #gpu.block y/x.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = tensor.empty() : tensor<2x8x32x32xi32>
    %8 = tensor.empty() : tensor<8x2x32x32xi32>
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    %13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    // Reduction loop: 8 steps of K-tile 32 over K = 256.
    %14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
      %15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xi32> to tensor<2x1x32x32xi32>
      // New per-iteration buffer for the packed A tile (memory space 1).
      %alloc_6 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %16 = bufferization.to_tensor %alloc_6 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_7 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
      %pack_8 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_7 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_9 = tensor.extract_slice %extracted_slice_0[%15, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %extracted_slice_10 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xi32> to tensor<1x2x32x32xi32>
      // New per-iteration buffer for the packed B tile (memory space 1).
      %alloc_11 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %extracted_slice_13 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
      %pack_14 = tensor.pack %pack_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Packed matmul-as-generic: i32 multiply-accumulate over the packed layouts.
      %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_14 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_15: i32, %out: i32):
        %19 = arith.muli %in, %in_15 : i32
        %20 = arith.addi %out, %19 : i32
        linalg.yield %20 : i32
      } -> tensor<2x2x8x8x4x4xi32>
      // Per-iteration input buffers released before yielding the accumulator.
      memref.dealloc %alloc_6 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_11 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %18 : tensor<2x2x8x8x4x4xi32>
    }
    %unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
// Reviewer annotations only; the IR text below is unchanged from the dump.
// Same function after iree-amdaie-tile-and-fuse: the packed generic is tiled
// over a new 2x2 thread-level scf.forall (%19). The untiled generic (%18) is
// left behind by the tiling and is now dead -- nothing uses its result (the
// inner forall consumes %arg4 and the K loop yields %19) -- it is deleted by
// the cleanup pass in the next dump.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: a 2x2 grid of 64x64 C tiles, mapped to #gpu.block y/x.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = tensor.empty() : tensor<2x8x32x32xi32>
    %8 = tensor.empty() : tensor<8x2x32x32xi32>
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %10 = tensor.empty() : tensor<2x8x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<8x2x8x4x8x4xi32>
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    %13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    // Reduction loop: 8 steps of K-tile 32 over K = 256.
    %14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
      %15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xi32> to tensor<2x1x32x32xi32>
      %alloc_6 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %16 = bufferization.to_tensor %alloc_6 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_7 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xi32> to tensor<2x1x4x8x4x8xi32>
      %pack_8 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_7 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_9 = tensor.extract_slice %extracted_slice_0[%15, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %extracted_slice_10 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xi32> to tensor<1x2x32x32xi32>
      %alloc_11 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %extracted_slice_13 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xi32> to tensor<1x2x8x4x8x4xi32>
      %pack_14 = tensor.pack %pack_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // DEAD after tiling: %18's result has no uses below; the tiled copy
      // inside %19 replaces it.
      %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_14 : tensor<2x1x4x8x4x8xi32>, tensor<1x2x8x4x8x4xi32>) outs(%arg4 : tensor<2x2x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_15: i32, %out: i32):
        %20 = arith.muli %in, %in_15 : i32
        %21 = arith.addi %out, %20 : i32
        linalg.yield %21 : i32
      } -> tensor<2x2x8x8x4x4xi32>
      // Thread-level tiling: 2x2 forall over the packed tiles, mapped to
      // #gpu.thread y/x; each point computes one 1x1x8x8x4x4 accumulator slice.
      %19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_15 = tensor.extract_slice %pack_8[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %extracted_slice_16 = tensor.extract_slice %pack_14[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_15, %extracted_slice_16 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_18: i32, %out: i32):
          %21 = arith.muli %in, %in_18 : i32
          %22 = arith.addi %out, %21 : i32
          linalg.yield %22 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_6 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_11 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %19 : tensor<2x2x8x8x4x4xi32>
    }
    %unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After AMDAIECleanup (iree-amdaie-cleanup) //----- //
// Reviewer annotations only; the IR text below is unchanged from the dump.
// Same function after iree-amdaie-cleanup: the dead untiled generic from the
// previous dump is gone, the unused 2x8x.../8x2x... tensor.empty staging
// tensors are gone, and the second-level packs now write directly into
// tensor.empty destinations of the per-iteration sliced shapes (%10, %11),
// hoisted above the K loop.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: a 2x2 grid of 64x64 C tiles, mapped to #gpu.block y/x.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    // Loop-invariant destinations for the second-level packs, now hoisted.
    %10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
    // Reduction loop: 8 steps of K-tile 32 over K = 256.
    %12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %alloc_8 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Thread-level 2x2 forall (mapped to #gpu.thread y/x); each point
      // multiplies one packed A slice by one packed B slice into its
      // 1x1x8x8x4x4 accumulator slice.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_13 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_14: i32, %out: i32):
          %18 = arith.muli %in, %in_14 : i32
          %19 = arith.addi %out, %18 : i32
          linalg.yield %19 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_8 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Two-stage unpack: compute layout -> 2x2x32x32 tiles -> 64x64 output slice.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Tiled + packed i32 matmul C[128x128] = A[128x256] * B[256x128], as dumped
// after Canonicalizer. Loop nest: an outer scf.forall carves C into 64x64
// blocks (mapped to #gpu.block<y/x>), an scf.for walks the K dimension in
// 8 steps of 32 (8 * 32 = 256), and an inner 2x2 scf.forall (mapped to
// #gpu.thread<y/x>) computes 32x32 sub-tiles in the doubly packed
// 1x1x8x8x4x4 accumulator layout.
// NOTE(review): memref address spaces 1 and 2 presumably denote the AIE
// L2 (memory-tile) and L1 (core-local) levels — confirm with the backend.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  // Bindings: %0 = A (read-only), %1 = B (read-only), %2 = C (write-only).
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // One iteration per 64x64 block of C (block-level tiling).
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    // Per-block slices: 64x256 of A, 256x64 of B, 64x64 of C.
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    // Staging buffer (space 1) for the unpacked 2x2x32x32 result tile.
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    // Accumulator buffer (space 2) in the doubly packed layout.
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator before the K reduction.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    %10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
    // K-reduction loop: 8 iterations of a 32-wide K slab.
    %12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      // K offset for this iteration: %arg3 * 32.
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      // First-level pack of the A slab into 32x32 outer tiles, then
      // second-level pack into 4x8 micro-tiles (outer dims transposed).
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %alloc_8 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      // Same two-level packing for the B slab, with 8x4 micro-tiles.
      %pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Thread-level 2x2 tiling: each (y, x) computes one 32x32 sub-tile.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        // Packed matmul micro-kernel: 9-D iteration space with three
        // reduction levels (d2, d5, d8) — multiply-accumulate into %out.
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_13 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_14: i32, %out: i32):
          %18 = arith.muli %in, %in_14 : i32
          %19 = arith.addi %out, %18 : i32
          linalg.yield %19 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_8 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Undo the two packing levels: 2x2x8x8x4x4 -> 2x2x32x32 -> 64x64.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After CSE (cse) //----- //
// Same tiled + packed i32 matmul (128x128x256) as the preceding dump,
// after CSE — no subexpressions were eliminated, so the IR is unchanged.
// Structure: block-level 64x64 scf.forall, K reduction scf.for (8 x 32),
// thread-level 2x2 scf.forall over 32x32 packed sub-tiles.
// NOTE(review): memref address spaces 1 and 2 presumably denote the AIE
// L2 (memory-tile) and L1 (core-local) levels — confirm with the backend.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  // Bindings: %0 = A (read-only), %1 = B (read-only), %2 = C (write-only).
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // One iteration per 64x64 block of C (block-level tiling).
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    // Per-block slices: 64x256 of A, 256x64 of B, 64x64 of C.
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    // Staging buffer (space 1) for the unpacked 2x2x32x32 result tile.
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    // Accumulator buffer (space 2) in the doubly packed layout.
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator before the K reduction.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    %10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
    // K-reduction loop: 8 iterations of a 32-wide K slab.
    %12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      // K offset for this iteration: %arg3 * 32.
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      // Two-level packing of the A slab: 32x32 outer tiles, 4x8 micro-tiles.
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %alloc_8 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      // Two-level packing of the B slab: 32x32 outer tiles, 8x4 micro-tiles.
      %pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Thread-level 2x2 tiling: each (y, x) computes one 32x32 sub-tile.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        // Packed matmul micro-kernel: 9-D iteration space with three
        // reduction levels (d2, d5, d8) — multiply-accumulate into %out.
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_13 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_14: i32, %out: i32):
        %18 = arith.muli %in, %in_14 : i32
        %19 = arith.addi %out, %18 : i32
        linalg.yield %19 : i32
      } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_7 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Undo the two packing levels: 2x2x8x8x4x4 -> 2x2x32x32 -> 64x64.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- //
// Dump after AMDAIEFusePackIntoLoop: the second-level (micro-tile) packs
// of A and B have been sunk into the thread-level scf.forall, so each
// thread packs only its own 1x1x32x32 slice (%pack_13 / %pack_17). The
// original loop-level second packs (%pack_6 / %pack_10) are still present
// but now dead in the compute path — later canonicalization removes them.
// NOTE(review): memref address spaces 1 and 2 presumably denote the AIE
// L2 (memory-tile) and L1 (core-local) levels — confirm with the backend.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  // Bindings: %0 = A (read-only), %1 = B (read-only), %2 = C (write-only).
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // One iteration per 64x64 block of C (block-level tiling).
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator before the K reduction.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    %10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
    // K-reduction loop: 8 iterations of a 32-wide K slab.
    %12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      // First-level packs stay at the loop level; %pack_6 / %pack_10 are
      // the pre-fusion second-level packs, no longer fed to the kernel.
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xi32> -> tensor<2x1x4x8x4x8xi32>
      %extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %alloc_8 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xi32> -> tensor<1x2x8x4x8x4xi32>
      // Thread-level 2x2 tiling; the fused per-thread packs live here now.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        // Fused A pack: slice 1x1x32x32 from the level-1 pack, micro-pack 4x8.
        %extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %extracted_slice_12 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %pack_13 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_12 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        %extracted_slice_14 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        // Fused B pack: slice 1x1x32x32 from the level-1 pack, micro-pack 8x4.
        %extracted_slice_15 = tensor.extract_slice %pack_9[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %extracted_slice_16 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %pack_17 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_18 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %extracted_slice_19 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        // Micro-kernel now consumes the fused per-thread packs directly.
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_20: i32, %out: i32):
          %18 = arith.muli %in, %in_20 : i32
          %19 = arith.addi %out, %18 : i32
          linalg.yield %19 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_8 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Undo the two packing levels: 2x2x8x8x4x4 -> 2x2x32x32 -> 64x64.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dump after Canonicalizer, following the pack-into-loop fusion: the dead
// loop-level second packs (and their dead extract_slices) have been DCE'd,
// leaving only the first-level 32x32 packs in the K loop and the fused
// per-thread micro-tile packs (%pack_11 / %pack_14) in the inner forall.
// NOTE(review): memref address spaces 1 and 2 presumably denote the AIE
// L2 (memory-tile) and L1 (core-local) levels — confirm with the backend.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  // Bindings: %0 = A (read-only), %1 = B (read-only), %2 = C (write-only).
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // One iteration per 64x64 block of C (block-level tiling).
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator before the K reduction.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    %10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
    // K-reduction loop: 8 iterations of a 32-wide K slab.
    %12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      // K offset for this iteration: %arg3 * 32.
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      // Only the first-level (32x32) packs remain at the loop level.
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %alloc_7 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      // Thread-level 2x2 tiling with fused per-thread micro-tile packing.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        // A: slice this thread's 1x1x32x32 tile and pack into 4x8 micro-tiles.
        %extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %pack_11 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_10 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        // B: same, packed into 8x4 micro-tiles.
        %extracted_slice_12 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %extracted_slice_13 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        // Packed matmul micro-kernel: 9-D iteration space with three
        // reduction levels (d2, d5, d8) — multiply-accumulate into %out.
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_11, %pack_14 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_16: i32, %out: i32):
        %18 = arith.muli %in, %in_16 : i32
        %19 = arith.addi %out, %18 : i32
        linalg.yield %19 : i32
      } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_7 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Undo the two packing levels: 2x2x8x8x4x4 -> 2x2x32x32 -> 64x64.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After CSE (cse) //----- //
// Same post-fusion IR as the preceding dump, after CSE — no common
// subexpressions were found, so the IR is unchanged. This is the input to
// the AMDAIEBufferizeToAllocation pass that follows in the log.
// NOTE(review): memref address spaces 1 and 2 presumably denote the AIE
// L2 (memory-tile) and L1 (core-local) levels — confirm with the backend.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  // Bindings: %0 = A (read-only), %1 = B (read-only), %2 = C (write-only).
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // One iteration per 64x64 block of C (block-level tiling).
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator before the K reduction.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    %10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
    %11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
    // K-reduction loop: 8 iterations of a 32-wide K slab.
    %12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      // K offset for this iteration: %arg3 * 32.
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
      %14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      // Only the first-level (32x32) packs remain at the loop level.
      %pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %alloc_7 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
      %15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      // Thread-level 2x2 tiling with fused per-thread micro-tile packing.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        // A: slice this thread's 1x1x32x32 tile and pack into 4x8 micro-tiles.
        %extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
        %pack_11 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_10 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        // B: same, packed into 8x4 micro-tiles.
        %extracted_slice_12 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %extracted_slice_13 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
        %pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        // Packed matmul micro-kernel: 9-D iteration space with three
        // reduction levels (d2, d5, d8) — multiply-accumulate into %out.
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_11, %pack_14 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_16: i32, %out: i32):
        %18 = arith.muli %in, %in_16 : i32
        %19 = arith.addi %out, %18 : i32
        linalg.yield %19 : i32
      } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
      memref.dealloc %alloc_7 : memref<1x2x32x32xi32, 1 : i32>
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Undo the two packing levels: 2x2x8x8x4x4 -> 2x2x32x32 -> 64x64.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  return
}
// -----// IR Dump After AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
%11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%alloc_5 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%alloc_7 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%alloc_15 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_15 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_16 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%20 = arith.muli %in, %in_18 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_15 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
memref.dealloc %alloc_5 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_7 : memref<1x2x32x32xi32, 1 : i32>
scf.yield %16 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
return
}
// -----// IR Dump After HoistStaticallyBoundAllocations (iree-hoist-statically-bound-allocations) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%10 = tensor.empty() : tensor<2x1x4x8x4x8xi32>
%11 = tensor.empty() : tensor<1x2x8x4x8x4xi32>
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_14 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_15 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_16 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%20 = arith.muli %in, %in_18 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %16 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%10 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %11] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %12 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%11, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%13 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%14 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%15 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_14 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %14 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%10 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
%11 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %11] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %12 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%11, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%13 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%14 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%15 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_14 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %14 : tensor<2x2x8x8x4x4xi32>
}
%unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIEPeelForLoop (iree-amdaie-peel-for-loop) //----- //
// NOTE(review): compiler IR dump — state of the dispatch after iree-amdaie-peel-for-loop.
// The K-reduction loop (8 iterations of a 32-wide K slice; K = 256 = 8 * 32) has been
// peeled into three phases over the same body:
//   %10: first iteration,  k in [0, 1) — consumes the zero-filled accumulator %9
//   %11: middle iterations, k in [1, 7) — steady state
//   %12: last iteration,   k in [7, 8) — its result feeds the unpack/store epilogue
// Peeling exposes the prologue/epilogue so later passes can specialize them separately.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %c8 = arith.constant 8 : index
  %c1 = arith.constant 1 : index
  // Scratch buffers. Memory space annotations: "1 : i32" and "2 : i32" — presumably
  // L2 (memtile) and L1 (core-local) on the AIE target; confirm against the backend.
  %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
  %alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
  %alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
  %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
  %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
  %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: a 2x2 grid of 64x64 output tiles, mapped to #gpu.block<y>/<x>.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator; only the first peeled iteration reads it.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    %c1_7 = arith.constant 1 : index
    // Phase 1: peeled first K-iteration (trip count 1, k == 0).
    %10 = scf.for %arg3 = %c0 to %c1_7 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      // K offset in elements: k * 32.
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      // Thread-level tiling: 2x2 cores, mapped to #gpu.thread<y>/<x>.
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
        // Re-pack LHS to the core-local 4x8 micro-tile layout.
        %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        %extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
        // Re-pack RHS to the core-local 8x4 micro-tile layout.
        %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        // Packed i32 matmul-accumulate over the 9-D (3 parallel/3 reduction pairs) iteration space.
        %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_17: i32, %out: i32):
          %20 = arith.muli %in, %in_17 : i32
          %21 = arith.addi %out, %20 : i32
          linalg.yield %21 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    %c7 = arith.constant 7 : index
    // Phase 2: main K loop, k in [1, 7). Body is identical to phase 1; it chains the
    // accumulator %10 through iter_args.
    %11 = scf.for %arg3 = %c1_7 to %c7 step %c1 iter_args(%arg4 = %10) -> (tensor<2x2x8x8x4x4xi32>) {
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
        %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        %extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
        %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_17: i32, %out: i32):
          %20 = arith.muli %in, %in_17 : i32
          %21 = arith.addi %out, %20 : i32
          linalg.yield %21 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Phase 3: peeled last K-iteration (trip count 1, k == 7); its result %12 is unpacked below.
    %12 = scf.for %arg3 = %c7 to %c8 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
      %13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
        %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        %extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
        %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_17: i32, %out: i32):
          %20 = arith.muli %in, %in_17 : i32
          %21 = arith.addi %out, %20 : i32
          linalg.yield %21 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %16 : tensor<2x2x8x8x4x4xi32>
    }
    // Epilogue: undo the two-level packing (micro-tile layout, then 32x32 tiles)
    // and publish the 64x64 block result into the shared 128x128 output.
    %unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_8 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
  memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
  memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
  memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// NOTE(review): compiler IR dump — same dispatch after the canonicalizer ran on the
// peeled form. The single-trip first and last scf.for wrappers were folded away:
//   - first K-iteration (k == 0): fully inlined; its K offset folded to 0
//     (slices at [0, 0]) and its forall %12 consumes the zero-fill %9 directly.
//   - middle iterations: remain as the scf.for %13 over k in [1, 7).
//   - last K-iteration (k == 7): inlined with the constant offset 224 (= 7 * 32).
// The now-unused %c8 constant was also removed.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c7 = arith.constant 7 : index
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %c1 = arith.constant 1 : index
  // Scratch buffers; memory spaces 1/2 presumably map to AIE L2 memtile / L1 core
  // memory — confirm against the backend lowering.
  %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
  %alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
  %alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
  %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
  %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
  %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Block-level tiling: 2x2 grid of 64x64 output tiles, mapped to #gpu.block<y>/<x>.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // Zero-initialize the packed accumulator; consumed by the inlined k == 0 body below.
    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
    // Inlined first K-iteration (k == 0): slice offsets folded to constant 0.
    %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
    %10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
    %pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
    %extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
    %11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
    %pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
    // Thread-level tiling: 2x2 cores, mapped to #gpu.thread<y>/<x>.
    %12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %9) -> (tensor<2x2x8x8x4x4xi32>) {
      %extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
      %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
      %pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
      %extracted_slice_17 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
      %18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
      %pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
      %extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
      // Packed i32 matmul-accumulate micro-kernel.
      %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
    ^bb0(%in: i32, %in_20: i32, %out: i32):
      %20 = arith.muli %in, %in_20 : i32
      %21 = arith.addi %out, %20 : i32
      linalg.yield %21 : i32
    } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Middle K iterations, k in [1, 7); accumulator chained from the k == 0 result %12.
    %13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xi32>) {
      // K offset in elements: k * 32.
      %17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_17 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
        %pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        %extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
        %pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_20, %pack_22 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_23 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
        ^bb0(%in: i32, %in_24: i32, %out: i32):
          %24 = arith.muli %in, %in_24 : i32
          %25 = arith.addi %out, %24 : i32
          linalg.yield %25 : i32
        } -> tensor<1x1x8x8x4x4xi32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %20 : tensor<2x2x8x8x4x4xi32>
    }
    // Inlined last K-iteration (k == 7): offset folded to the constant 224 (= 7 * 32).
    %extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
    %14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
    %pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
    %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
    %15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
    %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
    %16 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
      %extracted_slice_15 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
      %17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
      %pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
      %extracted_slice_17 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
      %18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
      %pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
      %extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
      %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
      ^bb0(%in: i32, %in_20: i32, %out: i32):
        %20 = arith.muli %in, %in_20 : i32
        %21 = arith.addi %out, %20 : i32
        linalg.yield %21 : i32
      } -> tensor<1x1x8x8x4x4xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Epilogue: undo the two-level packing and publish the 64x64 block result.
    %unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
    %unpack_14 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack_14 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
  memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
  memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
  memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
  return
}
// -----// IR Dump After AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- //
// ---------------------------------------------------------------------------
// Tiled i32 matmul dispatch: C[128x128] = A[128x256] * B[256x128], as dumped
// after iree-amdaie-fuse-fill-into-forall. The accumulator zero-fill has been
// fused INTO the first inner scf.forall (%19 below), so each (y, x) thread
// initializes only its own 1x1x8x8x4x4 slice instead of the whole buffer.
// Structure: outer 64x64 block tiling -> peeled K-tile 0 (with fused fill)
// -> scf.for over K-tiles 1..6 -> peeled K-tile 7 -> unpack to output.
// ---------------------------------------------------------------------------
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
// Scratch buffers reused across iterations. Space "2 : i32" holds the small
// per-thread packed tiles, space "1 : i32" the 32x32 staging tiles
// (presumably AIE core-local vs. memtile memory — confirm against target).
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
// Outer tiling: one 64x64 output block per (y, x) workgroup (#gpu.block).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
// NOTE(review): %9 is now unused — the fill was fused into the scf.forall
// below (%19), and %12 consumes %8 directly. Expected to be DCE'd later.
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
// Peeled K-tile 0: pack the A[.., 0..31] and B[0..31, ..] slices to L1/L2.
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
// 2x2 thread grid (#gpu.thread) over the block; each thread re-packs its
// operand tiles into the 6-D layouts and runs the packed matmul generic.
// This is where the fused linalg.fill (%19) zero-inits the thread's slice.
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_17 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
// Fused per-thread zero-init of the accumulator slice (the pass's effect).
%19 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%extracted_slice_20 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
// Packed matmul-as-generic: 9-D iteration space (3 parallel/reduction
// triples) multiplying 4x8 by 8x4 inner tiles and accumulating i32.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%21 = arith.muli %in, %in_20 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Main reduction loop: K-tiles 1..6, each a 32-wide slice at offset 32*k.
// The accumulator (%arg4) threads through; no fill here (accumulate only).
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xi32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_17 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_20, %pack_22 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_23 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_24: i32, %out: i32):
%24 = arith.muli %in, %in_24 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x8x8x4x4xi32>
}
// Peeled K-tile 7 (epilogue): last 32-wide slice at fixed offset 224.
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%16 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_15 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_17 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%20 = arith.muli %in, %in_20 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Two-stage unpack: 6-D accumulator -> 2x2x32x32 staging -> 64x64 output
// tile, which the block-level in_parallel writes back into %arg2.
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xi32> -> tensor<2x2x32x32xi32>
%unpack_14 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_14 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIEFuseConsumerIntoLoop (iree-amdaie-fuse-consumer-into-loop) //----- //
// ---------------------------------------------------------------------------
// Same matmul dispatch as dumped after iree-amdaie-fuse-consumer-into-loop:
// the first tensor.unpack (6-D accumulator -> 2x2x32x32) has been fused into
// the final scf.forall, which now returns TWO results (%16:2) via two
// shared_outs — the accumulator (%arg5) and the 32x32-tile staging buffer
// (%arg6). Each thread unpacks its own result slice in-loop (%unpack_21).
// ---------------------------------------------------------------------------
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
// Outer 64x64 block tiling over the 128x128 output (#gpu.block mapping).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
// NOTE(review): %9 is unused (fill already fused in-loop as %19); DCE fodder.
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
// Peeled K-tile 0 with the fused per-thread fill (%19 below).
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
// Per-thread zero-init of the accumulator slice (fused fill).
%19 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%21 = arith.muli %in, %in_20 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Main K loop: tiles 1..6, slice offset = 32 * %arg3.
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xi32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_23: i32, %out: i32):
%24 = arith.muli %in, %in_23 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x8x8x4x4xi32>
}
// Peeled K-tile 7 at offset 224; this forall now also hosts the fused
// consumer (the accumulator unpack), hence the two shared_outs.
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%20 = arith.muli %in, %in_22 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
// NOTE(review): %inserted_slice feeds only %extracted_slice_20, which has
// no further uses — both look like fusion leftovers awaiting cleanup
// (the unpack below consumes %19 directly, not the reinserted slice).
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
// Fused consumer: each thread unpacks its 1x1x8x8x4x4 result into its
// own 1x1x32x32 slice of the staging output.
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Only the second (outer) unpack remains at block level: staging 2x2x32x32
// result (%16#1) -> 64x64 output tile.
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%21 = arith.muli %in, %in_20 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xi32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_23: i32, %out: i32):
%24 = arith.muli %in, %in_23 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x8x8x4x4xi32>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%20 = arith.muli %in, %in_22 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
// NOTE(review): machine-generated IR dump (after iree-amdaie-bufferize-to-allocation).
// The function body is byte-identical to the preceding AMDAIEFusePackIntoLoop
// dump above -- this pass instance made no visible change here. Comments
// annotate structure only; every IR line is kept byte-identical.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
// Scratch buffers in memory spaces "1 : i32" / "2 : i32" (presumably AIE
// memory-hierarchy levels -- TODO confirm), reused each iteration through
// bufferization.to_tensor restrict writable.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
// Outer 64x64 tile loop, mapped onto the 2x2 block grid.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2x2x8x8x4x4xi32>) -> tensor<2x2x8x8x4x4xi32>
// Peeled K iteration 0: accumulator tiles are zero-initialized via
// linalg.fill inside the thread loop.
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
// NOTE(review): %extracted_slice_19 below is unused -- a dead leftover of
// pack/loop fusion, expected to be removed by later cleanup.
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%19 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%21 = arith.muli %in, %in_20 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Steady-state K loop over iterations 1..6; pack ops repack the current
// 32-wide K-slices (offset = 32 * %arg3) each trip.
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xi32>) {
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_23: i32, %out: i32):
%24 = arith.muli %in, %in_23 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %20 : tensor<2x2x8x8x4x4xi32>
}
// Peeled final K iteration (offset 224 = 7 * 32), fused with the unpack of
// the accumulator.
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%20 = arith.muli %in, %in_22 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
// NOTE(review): %inserted_slice feeds only %extracted_slice_20, which is
// unused -- both appear to be dead leftovers of fusion (the "two subviews"
// the gist title refers to -- TODO confirm).
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Unpack the 2x2x32x32 accumulator back to the 64x64 output tile.
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- //
// 128x128x256 i32 matmul dispatch. The output is tiled into 64x64 blocks by
// the outer scf.forall, and the 256-wide K dimension is consumed in 32-wide
// slices: a prologue (k-tile 0), an scf.for over k-tiles 1..6, and an
// epilogue (k-tile at column offset 224) that also unpacks the result.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Loop bounds (the k-loop runs %c1..%c7) and the i32 zero used by linalg.fill.
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
// Statically-shaped scratch buffers; the trailing `1 : i32` / `2 : i32` on
// the memref types are memory-space attributes.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Dispatch bindings: two read-only inputs (128x256 LHS, 256x128 RHS) and the
// write-only 128x128 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
// Outer tiling: one 64x64 output tile per (%arg0, %arg1); this forall is
// mapped to #gpu.block<y>/<x> (see the mapping attribute at its close).
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
// k-tile 0: pack the 64x32 LHS slice (2x1x32x32) and the 32x64 RHS slice
// (1x2x32x32) into the space-1 scratch buffers.
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
// Prologue forall (2x2, #gpu.thread mapped): repacks each 32x32 tile into
// 4x8/8x4 micro-tiles, zero-initializes the packed accumulator with
// linalg.fill, and performs the first multiply-accumulate.
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
// 9-D matmul-as-generic: three reduction dims (d2, d5, d8) over the packed
// layouts; the scalar body is an i32 multiply-add.
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%20 = arith.muli %in, %in_19 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Steady state: k-tiles 1..6 (column offset %arg3 * 32); the packed
// accumulator is the loop-carried iter_arg, so no fill here.
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_23: i32, %out: i32):
%23 = arith.muli %in, %in_23 : i32
%24 = arith.addi %out, %23 : i32
linalg.yield %24 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x8x8x4x4xi32>
}
// Epilogue: last k-tile at fixed offset 224; this forall also unpacks each
// accumulator tile back to 32x32 (second shared_out, %arg6).
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_21: i32, %out: i32):
%19 = arith.muli %in, %in_21 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Unpack this block's 2x2x32x32 result into its 64x64 output tile and insert
// it into the full 128x128 result.
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Store the final 128x128 result and release the scratch buffers.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIECleanup (iree-amdaie-cleanup) //----- //
// Same 128x128x256 i32 matmul dispatch after the cleanup pass; the structure
// (64x64 output tiling, 32-wide k-slices with prologue / scf.for k=1..6 /
// unpacking epilogue) matches the preceding dump.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Loop bounds (the k-loop runs %c1..%c7) and the i32 zero used by linalg.fill.
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1 = arith.constant 1 : index
// Statically-shaped scratch buffers; the trailing `1 : i32` / `2 : i32` on
// the memref types are memory-space attributes.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Dispatch bindings: two read-only inputs (128x256 LHS, 256x128 RHS) and the
// write-only 128x128 result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
// Outer tiling: one 64x64 output tile per (%arg0, %arg1); mapped to
// #gpu.block<y>/<x>.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
// k-tile 0: pack the 64x32 LHS slice and the 32x64 RHS slice into the
// space-1 scratch buffers.
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
// Prologue forall (2x2, #gpu.thread mapped): micro-tile repacking,
// linalg.fill(0) of the packed accumulator, first multiply-accumulate.
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
// 9-D matmul-as-generic with reduction dims (d2, d5, d8); i32 multiply-add body.
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%20 = arith.muli %in, %in_19 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Steady state: k-tiles 1..6 (column offset %arg3 * 32); the packed
// accumulator is the loop-carried iter_arg, so no fill here.
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_23: i32, %out: i32):
%23 = arith.muli %in, %in_23 : i32
%24 = arith.addi %out, %23 : i32
linalg.yield %24 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x8x8x4x4xi32>
}
// Epilogue: last k-tile at fixed offset 224; this forall also unpacks each
// accumulator tile back to 32x32 (second shared_out, %arg6).
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_21: i32, %out: i32):
%19 = arith.muli %in, %in_21 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Unpack this block's 2x2x32x32 result into its 64x64 output tile and insert
// it into the full 128x128 result.
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Store the final 128x128 result and release the scratch buffers.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIEInsertLoopsForVectorization (iree-amdaie-insert-loops-for-vectorization) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c1 = arith.constant 1 : index
%c1_0 = arith.constant 1 : index
%c1_1 = arith.constant 1 : index
%c1_2 = arith.constant 1 : index
%c1_3 = arith.constant 1 : index
%c1_4 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c0_5 = arith.constant 0 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c1_8 = arith.constant 1 : index
%c1_9 = arith.constant 1 : index
%c1_10 = arith.constant 1 : index
%c1_11 = arith.constant 1 : index
%c1_12 = arith.constant 1 : index
%c0_13 = arith.constant 0 : index
%c0_14 = arith.constant 0 : index
%c0_15 = arith.constant 0 : index
%c1_16 = arith.constant 1 : index
%c1_17 = arith.constant 1 : index
%c1_18 = arith.constant 1 : index
%c1_19 = arith.constant 1 : index
%c1_20 = arith.constant 1 : index
%c1_21 = arith.constant 1 : index
%c0_22 = arith.constant 0 : index
%c0_23 = arith.constant 0 : index
%c0_24 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_25 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c1_26 = arith.constant 1 : index
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_27 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_28 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_29 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_30 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_31 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0_25) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0_25) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0_25) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = tensor.empty() : tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_32 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_33 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_31 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_30 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%extracted_slice_34 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%9 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_34 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_35 = tensor.extract_slice %extracted_slice_32[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%10 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_36 = tensor.pack %extracted_slice_35 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_41 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_42 = tensor.pack %extracted_slice_41 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_43 = tensor.extract_slice %pack_36[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_45 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_45 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%c0_46 = arith.constant 0 : index
%c1_47 = arith.constant 1 : index
%c1_48 = arith.constant 1 : index
%c0_49 = arith.constant 0 : index
%c1_50 = arith.constant 1 : index
%c1_51 = arith.constant 1 : index
%c0_52 = arith.constant 0 : index
%c1_53 = arith.constant 1 : index
%c1_54 = arith.constant 1 : index
%c0_55 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1_56 = arith.constant 1 : index
%c0_57 = arith.constant 0 : index
%c8_58 = arith.constant 8 : index
%c1_59 = arith.constant 1 : index
%c0_60 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c1_61 = arith.constant 1 : index
%19 = scf.for %arg6 = %c0_46 to %c1_47 step %c1_48 iter_args(%arg7 = %18) -> (tensor<1x1x8x8x4x4xi32>) {
%20 = scf.for %arg8 = %c0_49 to %c1_50 step %c1_51 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xi32>) {
%21 = scf.for %arg10 = %c0_52 to %c1_53 step %c1_54 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
%22 = scf.for %arg12 = %c0_55 to %c8 step %c1_56 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg14 = %c0_57 to %c8_58 step %c1_59 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
%24 = scf.for %arg16 = %c0_60 to %c4 step %c1_61 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_62 = tensor.extract_slice %pack_42[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_63 = tensor.extract_slice %pack_44[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_64 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_62, %extracted_slice_63 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_64 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%26 = arith.muli %in, %in_65 : i32
%27 = arith.addi %out, %26 : i32
linalg.yield %27 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %25 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %24 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %22 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %21 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %20 : tensor<1x1x8x8x4x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1_26 to %c7 step %c1_26 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_41 = tensor.extract_slice %extracted_slice[0, %16] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%17 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_42 = tensor.pack %extracted_slice_41 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_43 = tensor.extract_slice %extracted_slice_32[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%18 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_45 = tensor.extract_slice %pack_42[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%20 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_46 = tensor.pack %extracted_slice_45 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_47 = tensor.extract_slice %pack_44[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_48 = tensor.pack %extracted_slice_47 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_49 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%c0_50 = arith.constant 0 : index
%c1_51 = arith.constant 1 : index
%c1_52 = arith.constant 1 : index
%c0_53 = arith.constant 0 : index
%c1_54 = arith.constant 1 : index
%c1_55 = arith.constant 1 : index
%c0_56 = arith.constant 0 : index
%c1_57 = arith.constant 1 : index
%c1_58 = arith.constant 1 : index
%c0_59 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1_60 = arith.constant 1 : index
%c0_61 = arith.constant 0 : index
%c8_62 = arith.constant 8 : index
%c1_63 = arith.constant 1 : index
%c0_64 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c1_65 = arith.constant 1 : index
%22 = scf.for %arg8 = %c0_50 to %c1_51 step %c1_52 iter_args(%arg9 = %extracted_slice_49) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg10 = %c0_53 to %c1_54 step %c1_55 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
%24 = scf.for %arg12 = %c0_56 to %c1_57 step %c1_58 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
%25 = scf.for %arg14 = %c0_59 to %c8 step %c1_60 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
%26 = scf.for %arg16 = %c0_61 to %c8_62 step %c1_63 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
%27 = scf.for %arg18 = %c0_64 to %c4 step %c1_65 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_66 = tensor.extract_slice %pack_46[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_67 = tensor.extract_slice %pack_48[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_68 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_66, %extracted_slice_67 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_68 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_69: i32, %out: i32):
%29 = arith.muli %in, %in_69 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %28 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %27 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %26 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %25 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %24 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x8x8x4x4xi32>
}
%extracted_slice_37 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%13 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_38 = tensor.pack %extracted_slice_37 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_39 = tensor.extract_slice %extracted_slice_32[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%14 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_40 = tensor.pack %extracted_slice_39 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_41 = tensor.extract_slice %pack_38[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_42 = tensor.pack %extracted_slice_41 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_43 = tensor.extract_slice %pack_40[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_45 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%c0_46 = arith.constant 0 : index
%c1_47 = arith.constant 1 : index
%c1_48 = arith.constant 1 : index
%c0_49 = arith.constant 0 : index
%c1_50 = arith.constant 1 : index
%c1_51 = arith.constant 1 : index
%c0_52 = arith.constant 0 : index
%c1_53 = arith.constant 1 : index
%c1_54 = arith.constant 1 : index
%c0_55 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1_56 = arith.constant 1 : index
%c0_57 = arith.constant 0 : index
%c8_58 = arith.constant 8 : index
%c1_59 = arith.constant 1 : index
%c0_60 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c1_61 = arith.constant 1 : index
%18 = scf.for %arg7 = %c0_46 to %c1_47 step %c1_48 iter_args(%arg8 = %extracted_slice_45) -> (tensor<1x1x8x8x4x4xi32>) {
%19 = scf.for %arg9 = %c0_49 to %c1_50 step %c1_51 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xi32>) {
%20 = scf.for %arg11 = %c0_52 to %c1_53 step %c1_54 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xi32>) {
%21 = scf.for %arg13 = %c0_55 to %c8 step %c1_56 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xi32>) {
%22 = scf.for %arg15 = %c0_57 to %c8_58 step %c1_59 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg17 = %c0_60 to %c4 step %c1_61 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_64 = tensor.extract_slice %pack_42[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_65 = tensor.extract_slice %pack_44[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_66 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_64, %extracted_slice_65 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_66 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_67: i32, %out: i32):
%25 = arith.muli %in, %in_67 : i32
%26 = arith.addi %out, %25 : i32
linalg.yield %26 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %24 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %22 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %21 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %20 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
%extracted_slice_62 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%unpack_63 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_62 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_63 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_33 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_31 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_30 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_29 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_28 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_27 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIEVectorization (iree-amdaie-vectorization) //----- //
// 128x128x256 i32 matmul dispatch after AMDAIEVectorization.
// Structure: the 128x128 output is tiled into 64x64 blocks (outer scf.forall,
// mapped to gpu.block), each block is packed into 2x2x32x32 tiles, and each
// 32x32 tile is further packed into 4x8 / 8x4 micro-tiles for the inner
// 4x8 * 8x4 -> 4x4 linalg.generic. The K dimension (256 = 8 tiles of 32) is
// peeled: k=0 is the prologue (initializes the accumulator via linalg.fill),
// k=1..6 run in an scf.for, and k=7 (offset 224) is the epilogue fused with
// the unpack of the accumulator back to 32x32 tiles.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  %c4 = arith.constant 4 : index
  %c8 = arith.constant 8 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c7 = arith.constant 7 : index
  %c0_i32 = arith.constant 0 : i32
  // Scratch buffers reused across K iterations via bufferization.to_tensor
  // restrict writable. Memory space "2 : i32" holds the micro-tile packed
  // operands/accumulator; "1 : i32" holds the 32x32 packed tiles
  // (presumably L1/core-local vs. L2/memtile on AIE — TODO confirm).
  %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
  %alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
  %alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
  %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
  %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
  %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
  %5 = tensor.empty() : tensor<128x128xi32>
  // Outer block-level tiling: one (64, 64) output tile per (block y, block x).
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
    %extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
    %extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
    %7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
    %8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
    // ---- K tile 0 (prologue): pack LHS/RHS K-slice [0, 32) into 32x32 tiles.
    %extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
    %9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
    %pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
    %extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
    %10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
    %pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
    // Thread-level forall (2x2): each thread owns one 32x32 tile of the block.
    // Only this first K iteration runs linalg.fill to zero the accumulator.
    %11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
      // Repack the thread's 32x32 LHS tile as 4x8 micro-tiles (transposed
      // outer dims) and the RHS tile as 8x4 micro-tiles.
      %extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
      %16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
      %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
      %extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
      %17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
      %pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
      %extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
      %18 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
      // 6-deep loop nest over the packed dims (three unit dims, then 8x8x4
      // micro-tile indices); innermost body is a 4x8 * 8x4 -> 4x4 matmul.
      %19 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %18) -> (tensor<1x1x8x8x4x4xi32>) {
        %20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xi32>) {
          %21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
            %22 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
              %23 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
                %24 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
                  %extracted_slice_19 = tensor.extract_slice %pack_15[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
                  %extracted_slice_20 = tensor.extract_slice %pack_17[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
                  %extracted_slice_21 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
                  // Micro-kernel: multiply-accumulate over the shared 8-dim.
                  %25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_19, %extracted_slice_20 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_21 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                  ^bb0(%in: i32, %in_22: i32, %out: i32):
                    %26 = arith.muli %in, %in_22 : i32
                    %27 = arith.addi %out, %26 : i32
                    linalg.yield %27 : i32
                  } -> tensor<1x1x1x1x4x4xi32>
                  %inserted_slice = tensor.insert_slice %25 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
                  scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
                }
                scf.yield %24 : tensor<1x1x8x8x4x4xi32>
              }
              scf.yield %23 : tensor<1x1x8x8x4x4xi32>
            }
            scf.yield %22 : tensor<1x1x8x8x4x4xi32>
          }
          scf.yield %21 : tensor<1x1x8x8x4x4xi32>
        }
        scf.yield %20 : tensor<1x1x8x8x4x4xi32>
      }
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // ---- K tiles 1..6 (middle): same structure, accumulating into %11;
    // K offset = %arg3 * 32, no fill (accumulator carried in iter_args).
    %12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
      %16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
      %extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
      %17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
      %pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
      %extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
      %18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
      %pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
      %19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
        %extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
        %20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
        %pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
        %extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
        %21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
        %pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
        %extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
        %22 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_22) -> (tensor<1x1x8x8x4x4xi32>) {
          %23 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
            %24 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
              %25 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
                %26 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
                  %27 = scf.for %arg18 = %c0 to %c4 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xi32>) {
                    %extracted_slice_23 = tensor.extract_slice %pack_19[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
                    %extracted_slice_24 = tensor.extract_slice %pack_21[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
                    %extracted_slice_25 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
                    %28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_23, %extracted_slice_24 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_25 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                    ^bb0(%in: i32, %in_26: i32, %out: i32):
                      %29 = arith.muli %in, %in_26 : i32
                      %30 = arith.addi %out, %29 : i32
                      linalg.yield %30 : i32
                    } -> tensor<1x1x1x1x4x4xi32>
                    %inserted_slice = tensor.insert_slice %28 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
                    scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
                  }
                  scf.yield %27 : tensor<1x1x8x8x4x4xi32>
                }
                scf.yield %26 : tensor<1x1x8x8x4x4xi32>
              }
              scf.yield %25 : tensor<1x1x8x8x4x4xi32>
            }
            scf.yield %24 : tensor<1x1x8x8x4x4xi32>
          }
          scf.yield %23 : tensor<1x1x8x8x4x4xi32>
        }
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %19 : tensor<2x2x8x8x4x4xi32>
    }
    // ---- K tile 7 (epilogue, offset 224): last accumulation, fused with the
    // unpack of the finished accumulator into 32x32 tiles (%15#1).
    %extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
    %13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
    %pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
    %extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
    %14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
    %pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
    // This forall yields two results: the accumulator and the unpacked tiles.
    %15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
      %extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
      %16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
      %pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
      %extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
      %17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
      %pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
      %extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
      %18 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_18) -> (tensor<1x1x8x8x4x4xi32>) {
        %19 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xi32>) {
          %20 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xi32>) {
            %21 = scf.for %arg13 = %c0 to %c8 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xi32>) {
              %22 = scf.for %arg15 = %c0 to %c8 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xi32>) {
                %23 = scf.for %arg17 = %c0 to %c4 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xi32>) {
                  %extracted_slice_21 = tensor.extract_slice %pack_15[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
                  %extracted_slice_22 = tensor.extract_slice %pack_17[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
                  %extracted_slice_23 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
                  %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_23 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                  ^bb0(%in: i32, %in_24: i32, %out: i32):
                    %25 = arith.muli %in, %in_24 : i32
                    %26 = arith.addi %out, %25 : i32
                    linalg.yield %26 : i32
                  } -> tensor<1x1x1x1x4x4xi32>
                  %inserted_slice = tensor.insert_slice %24 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
                  scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
                }
                scf.yield %23 : tensor<1x1x8x8x4x4xi32>
              }
              scf.yield %22 : tensor<1x1x8x8x4x4xi32>
            }
            scf.yield %21 : tensor<1x1x8x8x4x4xi32>
          }
          scf.yield %20 : tensor<1x1x8x8x4x4xi32>
        }
        scf.yield %19 : tensor<1x1x8x8x4x4xi32>
      }
      // Unpack the thread's finished micro-tiled accumulator back to 32x32.
      %extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
      %unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
        tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Unpack the 2x2x32x32 block result back to the 64x64 output tile.
    %unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
  memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
  memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
  memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
  memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
  return
}
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
// 128x128x256 i32 matmul dispatch, dumped after EliminateEmptyTensors.
// Structure: one block-level scf.forall over 64x64 output tiles; inside it the
// K dimension (256, in 32-wide slices) is split into a prologue (first K slice,
// fused with linalg.fill), a steady-state scf.for over K slices 1..6, and an
// epilogue (K slice at offset 224, fused with tensor.unpack of the result).
// The bufferization.to_tensor ops bind pre-made memref scratch buffers
// (address space 1 = intermediate/shared, 2 = innermost/local per the memref
// attributes) so empty tensors are eliminated — NOTE(review): space semantics
// inferred from the AIE pipeline context; confirm against target docs.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Loop bounds / constants: inner micro-tile loops run 8x8x4; %c7 bounds the
// steady-state K loop (slices 1..6); %c0_i32 seeds the accumulator fill.
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Scratch buffers reused across iterations (deallocated at function end):
// %alloc/%alloc_0: per-thread packed RHS/LHS micro-tiles (space 2),
// %alloc_1/%alloc_2: packed 32x32 RHS/LHS tiles (space 1),
// %alloc_3: packed accumulator (space 2), %alloc_4: unpacked 32x32 result tiles (space 1).
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// HAL bindings: %0 = LHS (128x256), %1 = RHS (256x128), %2 = output (128x128).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
// Loading the writeonly output as the forall's shared_out destination lets
// bufferization write results in place instead of allocating a new tensor.
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>> -> tensor<128x128xi32>
// Block-level tiling: 2x2 grid of 64x64 output tiles, mapped to GPU blocks.
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
// ---- Prologue: K slice 0 (columns [0,32)) — accumulator is freshly filled.
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
// Thread-level tiling: 2x2 threads, each owning one 32x32 sub-tile.
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
// Second-level packing into 4x8 (LHS) / 8x4 (RHS) micro-tiles with transposed
// outer dims, matching the packing_config attribute on the generic below.
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
// Zero-init accumulator — only in the prologue; later K slices accumulate.
%18 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
// 6-deep micro-tile loop nest (three unit loops from 32/32 tiling, then 8x8x4
// over 4x4 accumulator micro-tiles); innermost body is a 4x8 * 8x4 mac.
%19 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %18) -> (tensor<1x1x8x8x4x4xi32>) {
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xi32>) {
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
%22 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
%24 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_19 = tensor.extract_slice %pack_15[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_21 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_19, %extracted_slice_20 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_21 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%26 = arith.muli %in, %in_22 : i32
%27 = arith.addi %out, %26 : i32
linalg.yield %27 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %25 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %24 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %22 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %21 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %20 : tensor<1x1x8x8x4x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// ---- Steady state: K slices 1..6 (columns [32,224)), accumulating into %11.
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
// K offset = slice index * 32.
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
// Same 2x2 thread forall + micro-tile nest as the prologue, but the
// accumulator comes from iter_args (%arg4), so no linalg.fill here.
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%22 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_22) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
%24 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
%25 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
%26 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
%27 = scf.for %arg18 = %c0 to %c4 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_23 = tensor.extract_slice %pack_19[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_24 = tensor.extract_slice %pack_21[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_25 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_23, %extracted_slice_24 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_25 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_26: i32, %out: i32):
%29 = arith.muli %in, %in_26 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %28 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %27 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %26 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %25 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %24 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x8x8x4x4xi32>
}
// ---- Epilogue: final K slice (columns [224,256)), fused with result unpack.
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
// Two shared_outs: %arg5 = packed accumulator, %arg6 = unpacked 32x32 tiles.
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_18) -> (tensor<1x1x8x8x4x4xi32>) {
%19 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xi32>) {
%20 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xi32>) {
%21 = scf.for %arg13 = %c0 to %c8 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xi32>) {
%22 = scf.for %arg15 = %c0 to %c8 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg17 = %c0 to %c4 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_21 = tensor.extract_slice %pack_15[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_22 = tensor.extract_slice %pack_17[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_23 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_23 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_24: i32, %out: i32):
%25 = arith.muli %in, %in_24 : i32
%26 = arith.addi %out, %25 : i32
linalg.yield %26 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %24 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %22 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %21 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %20 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
// Unpack the finished 8x8x4x4 accumulator back to a 32x32 tile per thread.
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Unpack the 2x2 grid of 32x32 tiles (%15#1) into this block's 64x64 slice.
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Store the assembled 128x128 result and release all scratch buffers.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>> -> tensor<128x128xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xi32> to tensor<64x256xi32>
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xi32> to tensor<256x64xi32>
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xi32> to tensor<64x64xi32>
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xi32, 2 : i32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%19 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %18) -> (tensor<1x1x8x8x4x4xi32>) {
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xi32>) {
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
%22 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
%24 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_19 = tensor.extract_slice %pack_15[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_21 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_19, %extracted_slice_20 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_21 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%26 = arith.muli %in, %in_22 : i32
%27 = arith.addi %out, %26 : i32
linalg.yield %27 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %25 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %24 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %22 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %21 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %20 : tensor<1x1x8x8x4x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %16] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%17 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%16, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xi32>) {
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%20 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %20 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%21 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %21 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%22 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_22) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xi32>) {
%24 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xi32>) {
%25 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xi32>) {
%26 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xi32>) {
%27 = scf.for %arg18 = %c0 to %c4 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_23 = tensor.extract_slice %pack_19[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_24 = tensor.extract_slice %pack_21[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_25 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_23, %extracted_slice_24 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_25 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_26: i32, %out: i32):
%29 = arith.muli %in, %in_26 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %28 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %27 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %26 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %25 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %24 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %19 : tensor<2x2x8x8x4x4xi32>
}
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xi32> to tensor<64x32xi32>
%13 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xi32, 1 : i32>
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<64x32xi32> -> tensor<2x1x32x32xi32>
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xi32> to tensor<32x64xi32>
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xi32, 1 : i32>
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<32x64xi32> -> tensor<1x2x32x32xi32>
%15:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xi32>, tensor<2x2x32x32xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xi32> to tensor<1x1x32x32xi32>
%16 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xi32> to tensor<1x1x32x32xi32>
%17 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%18 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_18) -> (tensor<1x1x8x8x4x4xi32>) {
%19 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xi32>) {
%20 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xi32>) {
%21 = scf.for %arg13 = %c0 to %c8 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xi32>) {
%22 = scf.for %arg15 = %c0 to %c8 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xi32>) {
%23 = scf.for %arg17 = %c0 to %c4 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_21 = tensor.extract_slice %pack_15[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xi32> to tensor<1x1x1x1x4x8xi32>
%extracted_slice_22 = tensor.extract_slice %pack_17[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xi32> to tensor<1x1x1x1x8x4xi32>
%extracted_slice_23 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x1x1x4x4xi32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_21, %extracted_slice_22 : tensor<1x1x1x1x4x8xi32>, tensor<1x1x1x1x8x4xi32>) outs(%extracted_slice_23 : tensor<1x1x1x1x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_24: i32, %out: i32):
%25 = arith.muli %in, %in_24 : i32
%26 = arith.addi %out, %25 : i32
linalg.yield %26 : i32
} -> tensor<1x1x1x1x4x4xi32>
%inserted_slice = tensor.insert_slice %24 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
scf.yield %23 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %22 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %21 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %20 : tensor<1x1x8x8x4x4xi32>
}
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xi32> to tensor<1x1x32x32xi32>
%unpack_20 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<2x2x32x32xi32>
tensor.parallel_insert_slice %18 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<2x2x8x8x4x4xi32>
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%unpack = tensor.unpack %15#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<128x128xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_14 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_16 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%10 = arith.muli %in, %in_20 : i32
%11 = arith.addi %out, %10 : i32
linalg.yield %11 : i32
}
%subview_19 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
scf.yield %arg15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %5 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_15 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%3 = scf.for %arg2 = %c1 to %c7 step %c1 iter_args(%arg3 = %alloc_3) -> (memref<2x2x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_12 = memref.subview %subview[0, %4] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_13 = memref.subview %subview_5[%4, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg4, %arg5) in (2, 2) {
%subview_14 = memref.subview %alloc_2[%arg4, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_1[0, %arg5, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_16 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %subview_16) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%10 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_18 = memref.subview %alloc_0[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_18, %subview_19 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%11 = arith.muli %in, %in_22 : i32
%12 = arith.addi %out, %11 : i32
linalg.yield %12 : i32
}
%subview_21 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
scf.yield %arg17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %10 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_17 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%5 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.yield %arg3 : memref<2x2x8x8x4x4xi32, 2 : i32>
}
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_14 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
%subview_18 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_18, %subview_19 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%10 = arith.muli %in, %in_22 : i32
%11 = arith.addi %out, %10 : i32
linalg.yield %11 : i32
}
%subview_21 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
scf.yield %arg15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %9 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %8 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %7 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %6 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
scf.yield %5 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}
%subview_15 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_16 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
%subview_17 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_11 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_11 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
// NOTE(review): generated MLIR dump (IREE / AMD-AIE lowering pipeline) taken
// after resolve-shaped-type-result-dims. Code is kept byte-identical; the
// comments below only annotate the structure of the lowered kernel.
//
// 128x128x256 i32 matmul, fully bufferized onto memrefs:
//   * the 128x128 output is tiled into 64x64 blocks by an scf.forall mapped
//     to #gpu.block<y>/<x>;
//   * each block stages 32x32 tiles in memory-space-1 buffers and 4x8 / 8x4
//     micro-tiles plus an 8x8x4x4 accumulator in memory-space-2 buffers
//     (spaces 1 and 2 presumably correspond to the target's intermediate and
//     local memories — TODO confirm against the AIE target description);
//   * the K dimension (256 = 8 steps of 32) is split as: peeled first step
//     (with linalg.fill of the accumulator), an scf.for over steps 1..6, and
//     a peeled last step (offset 224) that also unpacks the result.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
  // Loop bounds and the accumulator-fill constant.
  %c4 = arith.constant 4 : index
  %c8 = arith.constant 8 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c7 = arith.constant 7 : index
  %c0_i32 = arith.constant 0 : i32
  // Scratch buffers. Space 2: per-thread micro-tile operands (%alloc,
  // %alloc_0) and the 2x2-thread accumulator (%alloc_3). Space 1: 32x32 tile
  // staging for LHS (%alloc_2), RHS (%alloc_1) and the unpacked result
  // (%alloc_4).
  %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
  %alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
  %alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
  %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
  %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
  %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
  // Dispatch I/O bindings: %0 = LHS 128x256, %1 = RHS 256x128 (read-only),
  // %2 = OUT 128x128; all i32 with 64-byte alignment asserted.
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
  // Outer 2x2 tiling of the output into 64x64 blocks; this forall carries
  // the #gpu.block<y>/<x> mapping (see its closing brace).
  scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
    %subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    // Peeled first K step (column/row offset 0): pack the 64x32 LHS slice
    // and 32x64 RHS slice into 32x32 tiles in the space-1 buffers.
    %subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
    %subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
    // 2x2 thread-level forall (#gpu.thread<y>/<x>): repack this thread's
    // tiles into space-2 micro-tile layouts (4x8 for LHS, 8x4 for RHS),
    // zero the accumulator, then run the micro-kernel loop nest.
    scf.forall (%arg2, %arg3) in (2, 2) {
      %subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
      %subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
      %subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
      // Zero-initialize this thread's accumulator tile (first K step only;
      // the steady-state loop below accumulates without a fill).
      linalg.fill ins(%c0_i32 : i32) outs(%subview_14 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
      // Three unit (0..1) outer loops, then 8x8x4 micro-tile loops; each
      // innermost iteration multiplies a 4x8 LHS micro-tile by an 8x4 RHS
      // micro-tile into a 4x4 accumulator slice, followed by an identity
      // copy-back generic onto the same accumulator view.
      %4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
        %5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
          %6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
            %7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
              %8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                %9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                  %subview_16 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
                  %subview_17 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
                  %subview_18 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                  // Micro-kernel: i32 multiply-accumulate generic carrying the
                  // lowering_config / packing_config attributes of the pipeline.
                  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                  ^bb0(%in: i32, %in_20: i32, %out: i32):
                    %10 = arith.muli %in, %in_20 : i32
                    %11 = arith.addi %out, %10 : i32
                    linalg.yield %11 : i32
                  }
                  // Identity copy of the 4x4 result onto an equal subview of the
                  // same iter_arg (source and destination offsets coincide).
                  %subview_19 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                  ^bb0(%in: i32, %out: i32):
                    linalg.yield %in : i32
                  }
                  scf.yield %arg15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                }
                scf.yield %9 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
              }
              scf.yield %8 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
            }
            scf.yield %7 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
          }
          scf.yield %6 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        }
        scf.yield %5 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
      }
      // Copy the loop-nest result back onto this thread's accumulator tile.
      %subview_15 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
      ^bb0(%in: i32, %out: i32):
        linalg.yield %in : i32
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Steady-state K loop, k = 1..6 (byte offsets k*32): same pack + thread
    // forall + micro-kernel as above, accumulating into %alloc_3 (no fill).
    %3 = scf.for %arg2 = %c1 to %c7 step %c1 iter_args(%arg3 = %alloc_3) -> (memref<2x2x8x8x4x4xi32, 2 : i32>) {
      %4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
      %subview_12 = memref.subview %subview[0, %4] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
      %subview_13 = memref.subview %subview_5[%4, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
      scf.forall (%arg4, %arg5) in (2, 2) {
        %subview_14 = memref.subview %alloc_2[%arg4, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
        iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
        %subview_15 = memref.subview %alloc_1[0, %arg5, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
        iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
        %subview_16 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        %5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %subview_16) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
          %6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
            %7 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
              %8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                %9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                  %10 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                    %subview_18 = memref.subview %alloc_0[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
                    %subview_19 = memref.subview %alloc[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
                    %subview_20 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_18, %subview_19 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                    ^bb0(%in: i32, %in_22: i32, %out: i32):
                      %11 = arith.muli %in, %in_22 : i32
                      %12 = arith.addi %out, %11 : i32
                      linalg.yield %12 : i32
                    }
                    %subview_21 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                    ^bb0(%in: i32, %out: i32):
                      linalg.yield %in : i32
                    }
                    scf.yield %arg17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                  }
                  scf.yield %10 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                }
                scf.yield %9 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
              }
              scf.yield %8 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
            }
            scf.yield %7 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
          }
          scf.yield %6 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        }
        %subview_17 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%5 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
        ^bb0(%in: i32, %out: i32):
          linalg.yield %in : i32
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %arg3 : memref<2x2x8x8x4x4xi32, 2 : i32>
    }
    // Peeled last K step (offset 224 = 7*32): final accumulate, then unpack
    // each thread's 8x8x4x4 accumulator into a 32x32 tile of %alloc_4.
    %subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
    %subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
    scf.forall (%arg2, %arg3) in (2, 2) {
      %subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
      %subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
      %subview_14 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
      %4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_14) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
        %5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
          %6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
            %7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
              %8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                %9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                  %subview_18 = memref.subview %alloc_0[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
                  %subview_19 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
                  %subview_20 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_18, %subview_19 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                  ^bb0(%in: i32, %in_22: i32, %out: i32):
                    %10 = arith.muli %in, %in_22 : i32
                    %11 = arith.addi %out, %10 : i32
                    linalg.yield %11 : i32
                  }
                  %subview_21 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
                  ^bb0(%in: i32, %out: i32):
                    linalg.yield %in : i32
                  }
                  scf.yield %arg15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                }
                scf.yield %9 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
              }
              scf.yield %8 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
            }
            scf.yield %7 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
          }
          scf.yield %6 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        }
        scf.yield %5 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
      }
      // Unpack this thread's accumulator into its 32x32 tile of %alloc_4,
      // then an identity generic over two equal subviews of %alloc_4 — this
      // is the pair of foldable subviews the dump title refers to.
      %subview_15 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
      iree_linalg_ext.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
      %subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_16 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) {
      ^bb0(%in: i32, %out: i32):
        linalg.yield %in : i32
      }
      %subview_17 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
      linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
      ^bb0(%in: i32, %out: i32):
        linalg.yield %in : i32
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Unpack the 2x2 grid of 32x32 tiles back into this block's 64x64
    // output subview, followed by an identity copy onto the same region.
    iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
    %subview_11 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_11 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    }
  } {mapping = [#gpu.block<y>, #gpu.block<x>]}
  // Trailing whole-buffer identity generic: copies %2 onto itself
  // (a leftover of bufferizing a tensor-level copy).
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  }
  // Release all scratch buffers.
  memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
  memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
  memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
  memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
  memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
  return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Canonicalized IR for a 128x128x256 i32 matmul (C = A*B, A: 128x256, B: 256x128).
// Structure: a workgroup-level scf.forall produces one 64x64 output tile per
// (block y, block x). The K dimension (256) is split into 8 tiles of 32; the
// first K tile (k = 0) and the last K tile (offset 224, k = 7) are peeled out
// of the scf.for over k = 1..6 so that the accumulator zero-fill (prologue)
// and the unpack/copy-out (epilogue) are fused with them.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Loop-bound constants and the i32 zero used to fill the accumulator.
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Scratch buffers. Memory space "1 : i32" holds 32x32 staging tiles of the
// operands/result; memory space "2 : i32" holds the re-packed per-core
// operand tiles (4x8 / 8x4 inner tiles) and the 8x8x4x4 accumulator.
// NOTE(review): presumably the AIE memtile vs. core-local levels — confirm
// the memory-space mapping against the amdaie target.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Dispatch bindings: %0 = LHS (128x256, read-only), %1 = RHS (256x128,
// read-only), %2 = result (128x128, written). All 64-byte aligned.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
// Workgroup-level tiling: one 64x64 output tile per (%arg0, %arg1) pair,
// mapped to gpu block y/x (see the terminator's mapping attribute).
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// --- Peeled FIRST K tile (K offset 0..31): pack the 64x32 LHS slice and
// the 32x64 RHS slice into 32x32 tiles in the space-1 staging buffers.
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
// Thread-level distribution over the 2x2 grid of 32x32 output subtiles
// (mapped to gpu thread y/x). This first-iteration copy also zero-fills
// the accumulator before the reduction loops.
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_14 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Micro-kernel loops: 8x8 grid of 4x4 output inner tiles, reducing over
// 4 inner K steps; each step is a (4x8) * (8x4) -> (4x4) multiply-add.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%3 = arith.muli %in, %in_20 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
// %subview_19 has the same source, offsets, sizes, and strides as
// %subview_18, so this generic is an identity self-copy — this dump
// is titled "Avoid folding two subviews", i.e. the duplicate subview
// is deliberately kept un-folded here.
%subview_19 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
}
// Identity copy of the thread's accumulator tile onto a second subview of
// the same %alloc_3 region (same offsets/sizes as %subview_14).
%subview_15 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_14 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// --- MIDDLE K tiles k = 1..6 (K offset 32*k): same pack + micro-kernel
// pattern as the first peel, but without the accumulator fill.
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_12 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_13 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_14 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_16 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_18 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %subview_16[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_18, %subview_19 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%4 = arith.muli %in, %in_22 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
// Identity self-copy (duplicate subview kept un-folded, as above).
%subview_21 = memref.subview %subview_16[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
}
%subview_17 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_16 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// --- Peeled LAST K tile (K offset 224..255): fused with the epilogue that
// unpacks the accumulator into the 32x32 staging tiles of %alloc_4.
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_12 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_18 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_19 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_20 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_18, %subview_19 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_22: i32, %out: i32):
%3 = arith.muli %in, %in_22 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
// Identity self-copy (duplicate subview kept un-folded, as above).
%subview_21 = memref.subview %subview_14[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
}
// Epilogue per thread: unpack the finished 8x8x4x4 accumulator tile back
// to a 32x32 tile of %alloc_4, then identity-copy between two equal
// subviews of %alloc_4, and self-copy the accumulator region.
%subview_15 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_16 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
%subview_17 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_14 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Workgroup epilogue: unpack the 2x2 grid of 32x32 result tiles into the
// 64x64 output subview, then identity-copy it onto the same region of %2.
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_11 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_11 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Release all scratch buffers.
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%3 = arith.muli %in, %in_17 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_14 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_14 : memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_6 : memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// 128x128x256 i32 matmul, fully bufferized and tiled for a 2x2 core array.
// Structure: one outer scf.forall over 64x64 C-blocks (mapped to gpu.block),
// a K-loop over 32-wide reduction tiles with the first (k=0) and last (k=224)
// iterations peeled out of the scf.for, and an inner 2x2 scf.forall (mapped to
// gpu.thread) running a packed 4x8 * 8x4 -> 4x4 microkernel.
// NOTE(review): memory-space attributes `1 : i32` and `2 : i32` on the allocs
// presumably denote the two on-chip memory levels (e.g. AIE memtile vs.
// core-local memory) — confirm against the target's address-space convention.
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Loop bounds/steps for the microkernel loop nest (8x8x4 iterations) and the
// peeled K-loop (%c1 to %c7: steady-state k-tiles 1..6; tiles 0 and 7 peeled).
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Scratch buffers, reused across all K-iterations:
//   %alloc   — per-core packed B tile (space 2), 8x4 inner tiles
//   %alloc_0 — per-core packed A tile (space 2), 4x8 inner tiles
//   %alloc_1 — staged B slab 32x64 as 1x2x32x32 (space 1)
//   %alloc_2 — staged A slab 64x32 as 2x1x32x32 (space 1)
//   %alloc_3 — per-core accumulators for the 2x2 forall (space 2)
//   %alloc_4 — staged C output 64x64 as 2x2x32x32 (space 1)
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Dispatch bindings: %0 = A (128x256, read-only), %1 = B (256x128, read-only),
// %2 = C (128x128, written). All 64-byte aligned per the subspan declaration.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
// Block-level tiling: each (arg0, arg1) handles one 64x64 tile of C, reading
// a 64x256 strip of A and a 256x64 strip of B.
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// --- Peeled first K-iteration (k-tile 0): pack A/B k-slab 0 into space-1
// staging buffers; the accumulator is zero-filled here (linalg.fill below)
// instead of loading previous partials.
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
// Thread-level 2x2 forall: each (arg2, arg3) owns one 32x32 sub-tile of the
// 64x64 block; repacks its A/B 32x32 tiles into space-2 microkernel layout.
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Zero-initialize the accumulator — only in this peeled first iteration.
linalg.fill ins(%c0_i32 : i32) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// 8x8x4 microkernel loop nest: each innermost step multiplies a 4x8 A tile
// by an 8x4 B tile and accumulates into a 4x4 C tile (linalg.generic below).
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Packed matmul body: out += in_A * in_B (i32 multiply-accumulate).
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%3 = arith.muli %in, %in_17 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// --- Steady-state K-loop: k-tiles 1..6 (offset %3 = 32*%arg2). Same pack +
// microkernel pattern as above, but no fill (accumulators carry over) and
// no unpack (results stay resident in %alloc_3 until the last iteration).
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// --- Peeled last K-iteration (k-tile 7, offset 224): same compute, then the
// epilogue unpacks the space-2 accumulators into the space-1 C staging
// buffer (%alloc_4) and finally into the output binding.
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
// Per-core epilogue: unpack this core's 1x1x8x8x4x4 accumulator back to a
// 1x1x32x32 tile of the space-1 C staging buffer.
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Block epilogue: unpack the staged 2x2x32x32 C buffer into this block's
// 64x64 slice of the output binding.
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Release all scratch buffers.
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
// Tiled and packed i32 matmul (128x128x256) fully lowered to memrefs.
// Scratch buffers live in two faster address spaces (1 : i32 and 2 : i32 —
// presumably L2 and L1 on the AIE target; confirm against the backend docs).
// The reduction (K) dimension is split into 8 steps of 32 and peeled into a
// prologue (k = 0, which also zero-fills the accumulator), a steady-state
// loop (k = 1..6), and an epilogue (k = 7, followed by unpacking the results
// back to the global output buffer).
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Packed scratch buffers: space-2 allocs hold per-thread micro-tiles
// (%alloc/%alloc_0 for packed RHS/LHS, %alloc_3 for the accumulator);
// space-1 allocs hold the 2x2 grid of 32x32 tiles for one 64x64 block.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// HAL bindings: %0 = LHS (128x256), %1 = RHS (256x128), %2 = output (128x128).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
// Block-level tiling: each (arg0, arg1) iteration produces one 64x64 output tile.
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Prologue (k = 0): pack the first 32-wide K slices of LHS and RHS into the
// space-1 staging buffers.
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
// Thread-level 2x2 tiling: each thread repacks its 32x32 tile into the
// space-2 micro-tile layouts (4x8 / 8x4 inner tiles).
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Only the k = 0 iteration zero-initializes the packed accumulator.
linalg.fill ins(%c0_i32 : i32) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Micro-kernel: 4x8 * 8x4 -> 4x4 multiply-accumulate over the packed
// inner tiles, expressed as a 9-D linalg.generic.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%3 = arith.muli %in, %in_17 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Steady state (k = 1..6): same pack + micro-kernel sequence, accumulating
// into %alloc_3. No fill (accumulator persists) and no unpack here.
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// Epilogue (k = 7): last 32-wide K slice (offset 224 = 7 * 32), followed by
// unpacking the accumulator back out.
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
// Unpack this thread's finished 8x8x4x4 accumulator tile back into the
// space-1 2x2x32x32 staging buffer.
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Finally unpack the staged 64x64 result into the global output subview.
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Release all scratch buffers (reverse allocation order).
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After HoistStaticallyBoundAllocations (iree-hoist-statically-bound-allocations) //----- //
// Same matmul dispatch after HoistStaticallyBoundAllocations: the IR is
// unchanged relative to the previous dump because all allocations were
// already static and hoisted to function scope. See the structure comments
// below: prologue (k = 0 with accumulator fill), steady state (k = 1..6),
// epilogue (k = 7 plus result unpack).
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Packed scratch buffers: space-2 allocs hold per-thread micro-tiles,
// space-1 allocs hold the 2x2 grid of 32x32 tiles for one 64x64 block.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// HAL bindings: %0 = LHS (128x256), %1 = RHS (256x128), %2 = output (128x128).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
// Block-level tiling: each (arg0, arg1) iteration produces one 64x64 output tile.
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Prologue (k = 0): pack the first 32-wide K slices of LHS and RHS into the
// space-1 staging buffers.
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
// Thread-level 2x2 tiling: each thread repacks its 32x32 tile into the
// space-2 micro-tile layouts (4x8 / 8x4 inner tiles).
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Only the k = 0 iteration zero-initializes the packed accumulator.
linalg.fill ins(%c0_i32 : i32) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Micro-kernel: 4x8 * 8x4 -> 4x4 multiply-accumulate over the packed
// inner tiles, expressed as a 9-D linalg.generic.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%3 = arith.muli %in, %in_17 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Steady state (k = 1..6): same pack + micro-kernel sequence, accumulating
// into %alloc_3. No fill (accumulator persists) and no unpack here.
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// Epilogue (k = 7): last 32-wide K slice (offset 224 = 7 * 32), followed by
// unpacking the accumulator back out.
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
// Unpack this thread's finished 8x8x4x4 accumulator tile back into the
// space-1 2x2x32x32 staging buffer.
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Finally unpack the staged 64x64 result into the global output subview.
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Release all scratch buffers (reverse allocation order).
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- //
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%3 = arith.muli %in, %in_17 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
// -----// IR Dump After LowerUKernelOpsToCalls (iree-codegen-lower-ukernel-ops-to-calls) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<128x256xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<256x128xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<128x128xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_13 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%3 = arith.muli %in, %in_17 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
// Tiled + packed 128x128x256 i32 matmul (IR dump after erase-hal-descriptor-type;
// review comments added — the code itself is unchanged). K is processed in 32-wide
// tiles and the K loop is peeled: a prologue (K tile 0, which also zero-fills the
// accumulator), a steady-state scf.for over K tiles 1..6, and an epilogue (K tile
// at offset 224) that additionally unpacks the accumulator back to the output.
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Scratch buffers for the packed operands/accumulator.
// NOTE(review): memory spaces `2 : i32` / `1 : i32` presumably denote core-local
// and memtile memory on the AMD AIE target — confirm against the target docs.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Bindings: %0 = LHS (128x256), %1 = RHS (256x128), %2 = result (128x128).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
memref.assume_alignment %0, 64 : memref<128x256xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
memref.assume_alignment %1, 64 : memref<256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
memref.assume_alignment %2, 64 : memref<128x128xi32>
// Block-level tiling: one 64x64 output tile per (y, x) workgroup.
// Note the two-level subview chains here (%subview_7 slices %subview, which slices
// %0) — these are the aliases the later fold-memref-alias-ops pass collapses.
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xi32> to memref<64x256xi32, strided<[256, 1], offset: ?>>
%subview_5 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xi32> to memref<256x64xi32, strided<[128, 1], offset: ?>>
%subview_6 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, strided<[128, 1], offset: ?>>
// --- Prologue: K tile 0 — pack the 64x32 / 32x64 operand slices to space-1 buffers.
%subview_7 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>> to memref<64x32xi32, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %subview_5[0, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>> to memref<32x64xi32, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>> memref<1x2x32x32xi32, 1 : i32>)
// Thread-level 2x2 tiling: each thread re-packs its 32x32 operand tiles into
// space-2 micro-tiled layouts (4x8 / 8x4), zero-fills its accumulator tile
// (only here, in the prologue), then runs the micro-kernel loop nest.
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_9 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_11 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_11 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Innermost 8x8x4 loop nest over the packed 4x8 / 8x4 micro-tiles; the
// linalg.generic below is the 4x4-output multiply-accumulate micro-kernel.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_12 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_14 = memref.subview %subview_11[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_12, %subview_13 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_14 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_15: i32, %out: i32):
%3 = arith.muli %in, %in_15 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// --- Steady state: K tiles 1..6 (column offset = 32 * %arg2). The accumulator
// in %alloc_3 is reused across iterations, so there is no linalg.fill here.
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_11 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>> to memref<64x32xi32, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>> memref<2x1x32x32xi32, 1 : i32>)
%subview_12 = memref.subview %subview_5[%3, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>> to memref<32x64xi32, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_13 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_14 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_15 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_16 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_18 = memref.subview %subview_15[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_16, %subview_17 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_19: i32, %out: i32):
%4 = arith.muli %in, %in_19 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// --- Epilogue: last K tile (offset 224 = 7 * 32); after accumulating, each
// thread unpacks its accumulator tile into the 2x2x32x32 staging buffer.
%subview_9 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xi32, strided<[256, 1], offset: ?>> to memref<64x32xi32, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>> memref<2x1x32x32xi32, 1 : i32>)
%subview_10 = memref.subview %subview_5[224, 0] [32, 64] [1, 1] : memref<256x64xi32, strided<[128, 1], offset: ?>> to memref<32x64xi32, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_15 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_17 = memref.subview %subview_13[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_15, %subview_16 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_17 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%3 = arith.muli %in, %in_18 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
%subview_14 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_14 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Write the staged 64x64 block result back to the output binding.
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_6 : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
// Same matmul after fold-memref-alias-ops (review comments added — code unchanged).
// Observable effect of the pass: the chained subviews of the previous dump are
// folded away. The per-K-tile operand slices are now taken directly from the
// bindings %0 / %1 (see %subview_5, %subview_6, %subview_9, %subview_7 below)
// instead of slicing an intermediate 64x256 / 256x64 block subview, and the
// offset arithmetic is expressed via affine.apply on the loop induction variable.
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Scratch buffers. NOTE(review): spaces `2 : i32` / `1 : i32` presumably map to
// core-local and memtile memory on the AMD AIE target — confirm against target docs.
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Bindings: %0 = LHS (128x256), %1 = RHS (256x128), %2 = result (128x128).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
memref.assume_alignment %0, 64 : memref<128x256xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
memref.assume_alignment %1, 64 : memref<256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
memref.assume_alignment %2, 64 : memref<128x128xi32>
// Block-level tiling: one 64x64 output tile per (y, x) workgroup.
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, strided<[128, 1], offset: ?>>
// --- Prologue: K tile 0 — operand slices come straight from the bindings now.
%subview_5 = memref.subview %0[%arg0, 0] [64, 32] [1, 1] : memref<128x256xi32> to memref<64x32xi32, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>> memref<2x1x32x32xi32, 1 : i32>)
%subview_6 = memref.subview %1[0, %arg1] [32, 64] [1, 1] : memref<256x128xi32> to memref<32x64xi32, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>> memref<1x2x32x32xi32, 1 : i32>)
// Thread-level 2x2 tiling; the accumulator is zero-filled only in this prologue.
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_9 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_11 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview_11 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Innermost 8x8x4 micro-kernel loop nest over the packed 4x8 / 8x4 tiles.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_12 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_14 = memref.subview %subview_11[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_12, %subview_13 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_14 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_15: i32, %out: i32):
%3 = arith.muli %in, %in_15 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// --- Steady state: K tiles 1..6. The two affine.apply ops below compute the same
// 32 * %arg2 column/row offset (folding materialized one per folded subview).
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_9 = memref.subview %0[%arg0, %3] [64, 32] [1, 1] : memref<128x256xi32> to memref<64x32xi32, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>> memref<2x1x32x32xi32, 1 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_10 = memref.subview %1[%4, %arg1] [32, 64] [1, 1] : memref<256x128xi32> to memref<32x64xi32, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_11 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_13 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_14 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_16 = memref.subview %subview_13[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_14, %subview_15 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%5 = arith.muli %in, %in_17 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// --- Epilogue: last K tile (offset 224); afterwards each thread unpacks its
// accumulator tile into the 2x2x32x32 staging buffer.
%subview_7 = memref.subview %0[%arg0, 224] [64, 32] [1, 1] : memref<128x256xi32> to memref<64x32xi32, strided<[256, 1], offset: ?>>
iree_linalg_ext.pack %subview_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<64x32xi32, strided<[256, 1], offset: ?>> memref<2x1x32x32xi32, 1 : i32>)
%subview_8 = memref.subview %1[224, %arg1] [32, 64] [1, 1] : memref<256x128xi32> to memref<32x64xi32, strided<[128, 1], offset: ?>>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<32x64xi32, strided<[128, 1], offset: ?>> memref<1x2x32x32xi32, 1 : i32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_9 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_0 : (memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
%subview_11 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_13 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_14 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_15 = memref.subview %subview_11[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_13, %subview_14 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_15 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%3 = arith.muli %in, %in_16 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
%subview_12 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
iree_linalg_ext.unpack %subview_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_12 : (memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Write the staged 64x64 block result back to the output binding.
iree_linalg_ext.unpack %alloc_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview : (memref<2x2x32x32xi32, 1 : i32> memref<64x64xi32, strided<[128, 1], offset: ?>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After AMDAIEPackToDma (iree-amdaie-pack-to-dma) //----- //
// NOTE(review): compiler-generated IR, dumped after the
// iree-amdaie-pack-to-dma pass. Comments below only annotate structure;
// the op text is left exactly as the compiler emitted it.
module {
// 128x128x256 i32 matmul, tiled for AIR/AIE. Every iree_linalg_ext.pack /
// unpack from the previous dump has been rewritten as an explicit
// air.dma_memcpy_nd whose offsets/sizes/strides are materialized as
// individual arith.constant index ops -- hence the long runs of %cN_*
// constants below (no CSE/canonicalization has run on this IR yet).
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Staging buffers. Memory spaces 1 and 2 presumably correspond to the AIE
// L2 (memory-tile) and L1 (core-tile) levels respectively -- TODO confirm
// against the target's address-space mapping.
//   %alloc   : L1 RHS tile, packed 8x4x8x4
//   %alloc_0 : L1 LHS tile, packed 4x8x4x8
//   %alloc_1 : L2 RHS slab, 1x2 tiles of 32x32
//   %alloc_2 : L2 LHS slab, 2x1 tiles of 32x32
//   %alloc_3 : L1 accumulator, 2x2 tiles packed 8x8x4x4 (lives across k)
//   %alloc_4 : L2 output slab, 2x2 tiles of 32x32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Kernel I/O: %0 = LHS (128x256), %1 = RHS (256x128), %2 = result (128x128).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
memref.assume_alignment %0, 64 : memref<128x256xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
memref.assume_alignment %1, 64 : memref<256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
memref.assume_alignment %2, 64 : memref<128x128xi32>
// Workgroup tiling: one 64x64 output tile per (%arg0, %arg1) iteration,
// mapped to GPU blocks (see the mapping attribute on the closing brace).
// The reduction dim (256) is split into 8 k-tiles of 32; k=0 is peeled as
// a prologue here, k=1..6 run in the scf.for below, k=7 is the epilogue.
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, strided<[128, 1], offset: ?>>
%subview_5 = memref.subview %0[%arg0, 0] [64, 32] [1, 1] : memref<128x256xi32> to memref<64x32xi32, strided<[256, 1], offset: ?>>
%c0_6 = arith.constant 0 : index
%c0_7 = arith.constant 0 : index
%c0_8 = arith.constant 0 : index
%c0_9 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1_10 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c32_11 = arith.constant 32 : index
%c1024 = arith.constant 1024 : index
%c1024_12 = arith.constant 1024 : index
%c32_13 = arith.constant 32 : index
%c1_14 = arith.constant 1 : index
%c0_15 = arith.constant 0 : index
%c0_16 = arith.constant 0 : index
%c0_17 = arith.constant 0 : index
%c2_18 = arith.constant 2 : index
%c1_19 = arith.constant 1 : index
%c32_20 = arith.constant 32 : index
%c32_21 = arith.constant 32 : index
%c8192 = arith.constant 8192 : index
%c32_22 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c1_23 = arith.constant 1 : index
// Prologue (k=0): copy the 64x32 LHS k-tile from global %0 into L2 %alloc_2
// as 2x1 tiles of 32x32.
air.dma_memcpy_nd (%alloc_2[%c0_6, %c0_7, %c0_8, %c0_9] [%c2, %c1_10, %c32, %c32_11] [%c1024, %c1024_12, %c32_13, %c1_14], %0[%c0_15, %c0_16, %arg0, %c0_17] [%c2_18, %c1_19, %c32_20, %c32_21] [%c8192, %c32_22, %c256, %c1_23]) : (memref<2x1x32x32xi32, 1 : i32>, memref<128x256xi32>)
%subview_24 = memref.subview %1[0, %arg1] [32, 64] [1, 1] : memref<256x128xi32> to memref<32x64xi32, strided<[128, 1], offset: ?>>
%c0_25 = arith.constant 0 : index
%c0_26 = arith.constant 0 : index
%c0_27 = arith.constant 0 : index
%c0_28 = arith.constant 0 : index
%c1_29 = arith.constant 1 : index
%c2_30 = arith.constant 2 : index
%c32_31 = arith.constant 32 : index
%c32_32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c1024_33 = arith.constant 1024 : index
%c32_34 = arith.constant 32 : index
%c1_35 = arith.constant 1 : index
%c0_36 = arith.constant 0 : index
%c0_37 = arith.constant 0 : index
%c0_38 = arith.constant 0 : index
%c1_39 = arith.constant 1 : index
%c2_40 = arith.constant 2 : index
%c32_41 = arith.constant 32 : index
%c32_42 = arith.constant 32 : index
%c4096 = arith.constant 4096 : index
%c32_43 = arith.constant 32 : index
%c128 = arith.constant 128 : index
%c1_44 = arith.constant 1 : index
// Prologue (k=0): copy the 32x64 RHS k-tile from global %1 into L2 %alloc_1
// as 1x2 tiles of 32x32.
air.dma_memcpy_nd (%alloc_1[%c0_25, %c0_26, %c0_27, %c0_28] [%c1_29, %c2_30, %c32_31, %c32_32] [%c2048, %c1024_33, %c32_34, %c1_35], %1[%c0_36, %c0_37, %c0_38, %arg1] [%c1_39, %c2_40, %c32_41, %c32_42] [%c4096, %c32_43, %c128, %c1_44]) : (memref<1x2x32x32xi32, 1 : i32>, memref<256x128xi32>)
// Per-core work: 2x2 cores, each owning one 32x32 output tile; mapped to
// GPU threads (see closing brace).
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_107 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%c0_113 = arith.constant 0 : index
%c1_114 = arith.constant 1 : index
%c1_115 = arith.constant 1 : index
%c4_116 = arith.constant 4 : index
%c8_117 = arith.constant 8 : index
%c4_118 = arith.constant 4 : index
%c8_119 = arith.constant 8 : index
%c1024_120 = arith.constant 1024 : index
%c1024_121 = arith.constant 1024 : index
%c256_122 = arith.constant 256 : index
%c32_123 = arith.constant 32 : index
%c8_124 = arith.constant 8 : index
%c1_125 = arith.constant 1 : index
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%c1_131 = arith.constant 1 : index
%c1_132 = arith.constant 1 : index
%c4_133 = arith.constant 4 : index
%c8_134 = arith.constant 8 : index
%c4_135 = arith.constant 4 : index
%c8_136 = arith.constant 8 : index
%c1024_137 = arith.constant 1024 : index
%c1024_138 = arith.constant 1024 : index
%c8_139 = arith.constant 8 : index
%c128_140 = arith.constant 128 : index
%c32_141 = arith.constant 32 : index
%c1_142 = arith.constant 1 : index
// L2 -> L1: this core's 32x32 LHS tile, repacked into 4x8x4x8 layout
// (the strided source side performs the pack transpose).
air.dma_memcpy_nd (%alloc_0[%c0_108, %c0_109, %c0_110, %c0_111, %c0_112, %c0_113] [%c1_114, %c1_115, %c4_116, %c8_117, %c4_118, %c8_119] [%c1024_120, %c1024_121, %c256_122, %c32_123, %c8_124, %c1_125], %alloc_2[%arg2, %c0_126, %c0_127, %c0_128, %c0_129, %c0_130] [%c1_131, %c1_132, %c4_133, %c8_134, %c4_135, %c8_136] [%c1024_137, %c1024_138, %c8_139, %c128_140, %c32_141, %c1_142]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<2x1x32x32xi32, 1 : i32>)
%subview_143 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_144 = arith.constant 0 : index
%c0_145 = arith.constant 0 : index
%c0_146 = arith.constant 0 : index
%c0_147 = arith.constant 0 : index
%c0_148 = arith.constant 0 : index
%c0_149 = arith.constant 0 : index
%c1_150 = arith.constant 1 : index
%c1_151 = arith.constant 1 : index
%c8_152 = arith.constant 8 : index
%c4_153 = arith.constant 4 : index
%c8_154 = arith.constant 8 : index
%c4_155 = arith.constant 4 : index
%c1024_156 = arith.constant 1024 : index
%c1024_157 = arith.constant 1024 : index
%c128_158 = arith.constant 128 : index
%c32_159 = arith.constant 32 : index
%c4_160 = arith.constant 4 : index
%c1_161 = arith.constant 1 : index
%c0_162 = arith.constant 0 : index
%c0_163 = arith.constant 0 : index
%c0_164 = arith.constant 0 : index
%c0_165 = arith.constant 0 : index
%c0_166 = arith.constant 0 : index
%c1_167 = arith.constant 1 : index
%c1_168 = arith.constant 1 : index
%c8_169 = arith.constant 8 : index
%c4_170 = arith.constant 4 : index
%c8_171 = arith.constant 8 : index
%c4_172 = arith.constant 4 : index
%c2048_173 = arith.constant 2048 : index
%c1024_174 = arith.constant 1024 : index
%c4_175 = arith.constant 4 : index
%c256_176 = arith.constant 256 : index
%c32_177 = arith.constant 32 : index
%c1_178 = arith.constant 1 : index
// L2 -> L1: this core's 32x32 RHS tile, repacked into 8x4x8x4 layout.
air.dma_memcpy_nd (%alloc[%c0_144, %c0_145, %c0_146, %c0_147, %c0_148, %c0_149] [%c1_150, %c1_151, %c8_152, %c4_153, %c8_154, %c4_155] [%c1024_156, %c1024_157, %c128_158, %c32_159, %c4_160, %c1_161], %alloc_1[%c0_162, %arg3, %c0_163, %c0_164, %c0_165, %c0_166] [%c1_167, %c1_168, %c8_169, %c4_170, %c8_171, %c4_172] [%c2048_173, %c1024_174, %c4_175, %c256_176, %c32_177, %c1_178]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x2x32x32xi32, 1 : i32>)
%subview_179 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Zero-init the accumulator tile. Done only in this k=0 prologue; the
// steady-state and epilogue iterations accumulate into %alloc_3 in place.
linalg.fill ins(%c0_i32 : i32) outs(%subview_179 : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Microkernel: 8x8x4 loop nest over 4x8 (LHS) x 8x4 (RHS) -> 4x4 (acc)
// sub-tiles, expressed as a 9-d linalg.generic matmul-accumulate.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_180 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_181 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_182 = memref.subview %subview_179[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_180, %subview_181 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_182 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_183: i32, %out: i32):
%3 = arith.muli %in, %in_183 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Steady-state reduction loop: k-tiles 1..6 (column offset %3 = 32*k via
// affine.apply). Same structure as the prologue, minus the accumulator
// fill. k=7 is handled by the peeled epilogue after this loop.
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_107 = memref.subview %0[%arg0, %3] [64, 32] [1, 1] : memref<128x256xi32> to memref<64x32xi32, strided<[256, 1], offset: ?>>
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c2_112 = arith.constant 2 : index
%c1_113 = arith.constant 1 : index
%c32_114 = arith.constant 32 : index
%c32_115 = arith.constant 32 : index
%c1024_116 = arith.constant 1024 : index
%c1024_117 = arith.constant 1024 : index
%c32_118 = arith.constant 32 : index
%c1_119 = arith.constant 1 : index
%c0_120 = arith.constant 0 : index
%c0_121 = arith.constant 0 : index
%c2_122 = arith.constant 2 : index
%c1_123 = arith.constant 1 : index
%c32_124 = arith.constant 32 : index
%c32_125 = arith.constant 32 : index
%c8192_126 = arith.constant 8192 : index
%c32_127 = arith.constant 32 : index
%c256_128 = arith.constant 256 : index
%c1_129 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_2[%c0_108, %c0_109, %c0_110, %c0_111] [%c2_112, %c1_113, %c32_114, %c32_115] [%c1024_116, %c1024_117, %c32_118, %c1_119], %0[%c0_120, %c0_121, %arg0, %3] [%c2_122, %c1_123, %c32_124, %c32_125] [%c8192_126, %c32_127, %c256_128, %c1_129]) : (memref<2x1x32x32xi32, 1 : i32>, memref<128x256xi32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%subview_130 = memref.subview %1[%4, %arg1] [32, 64] [1, 1] : memref<256x128xi32> to memref<32x64xi32, strided<[128, 1], offset: ?>>
%c0_131 = arith.constant 0 : index
%c0_132 = arith.constant 0 : index
%c0_133 = arith.constant 0 : index
%c0_134 = arith.constant 0 : index
%c1_135 = arith.constant 1 : index
%c2_136 = arith.constant 2 : index
%c32_137 = arith.constant 32 : index
%c32_138 = arith.constant 32 : index
%c2048_139 = arith.constant 2048 : index
%c1024_140 = arith.constant 1024 : index
%c32_141 = arith.constant 32 : index
%c1_142 = arith.constant 1 : index
%c0_143 = arith.constant 0 : index
%c0_144 = arith.constant 0 : index
%c1_145 = arith.constant 1 : index
%c2_146 = arith.constant 2 : index
%c32_147 = arith.constant 32 : index
%c32_148 = arith.constant 32 : index
%c4096_149 = arith.constant 4096 : index
%c32_150 = arith.constant 32 : index
%c128_151 = arith.constant 128 : index
%c1_152 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_1[%c0_131, %c0_132, %c0_133, %c0_134] [%c1_135, %c2_136, %c32_137, %c32_138] [%c2048_139, %c1024_140, %c32_141, %c1_142], %1[%c0_143, %c0_144, %4, %arg1] [%c1_145, %c2_146, %c32_147, %c32_148] [%c4096_149, %c32_150, %c128_151, %c1_152]) : (memref<1x2x32x32xi32, 1 : i32>, memref<256x128xi32>)
scf.forall (%arg3, %arg4) in (2, 2) {
%subview_153 = memref.subview %alloc_2[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_154 = arith.constant 0 : index
%c0_155 = arith.constant 0 : index
%c0_156 = arith.constant 0 : index
%c0_157 = arith.constant 0 : index
%c0_158 = arith.constant 0 : index
%c0_159 = arith.constant 0 : index
%c1_160 = arith.constant 1 : index
%c1_161 = arith.constant 1 : index
%c4_162 = arith.constant 4 : index
%c8_163 = arith.constant 8 : index
%c4_164 = arith.constant 4 : index
%c8_165 = arith.constant 8 : index
%c1024_166 = arith.constant 1024 : index
%c1024_167 = arith.constant 1024 : index
%c256_168 = arith.constant 256 : index
%c32_169 = arith.constant 32 : index
%c8_170 = arith.constant 8 : index
%c1_171 = arith.constant 1 : index
%c0_172 = arith.constant 0 : index
%c0_173 = arith.constant 0 : index
%c0_174 = arith.constant 0 : index
%c0_175 = arith.constant 0 : index
%c0_176 = arith.constant 0 : index
%c1_177 = arith.constant 1 : index
%c1_178 = arith.constant 1 : index
%c4_179 = arith.constant 4 : index
%c8_180 = arith.constant 8 : index
%c4_181 = arith.constant 4 : index
%c8_182 = arith.constant 8 : index
%c1024_183 = arith.constant 1024 : index
%c1024_184 = arith.constant 1024 : index
%c8_185 = arith.constant 8 : index
%c128_186 = arith.constant 128 : index
%c32_187 = arith.constant 32 : index
%c1_188 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_0[%c0_154, %c0_155, %c0_156, %c0_157, %c0_158, %c0_159] [%c1_160, %c1_161, %c4_162, %c8_163, %c4_164, %c8_165] [%c1024_166, %c1024_167, %c256_168, %c32_169, %c8_170, %c1_171], %alloc_2[%arg3, %c0_172, %c0_173, %c0_174, %c0_175, %c0_176] [%c1_177, %c1_178, %c4_179, %c8_180, %c4_181, %c8_182] [%c1024_183, %c1024_184, %c8_185, %c128_186, %c32_187, %c1_188]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<2x1x32x32xi32, 1 : i32>)
%subview_189 = memref.subview %alloc_1[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_190 = arith.constant 0 : index
%c0_191 = arith.constant 0 : index
%c0_192 = arith.constant 0 : index
%c0_193 = arith.constant 0 : index
%c0_194 = arith.constant 0 : index
%c0_195 = arith.constant 0 : index
%c1_196 = arith.constant 1 : index
%c1_197 = arith.constant 1 : index
%c8_198 = arith.constant 8 : index
%c4_199 = arith.constant 4 : index
%c8_200 = arith.constant 8 : index
%c4_201 = arith.constant 4 : index
%c1024_202 = arith.constant 1024 : index
%c1024_203 = arith.constant 1024 : index
%c128_204 = arith.constant 128 : index
%c32_205 = arith.constant 32 : index
%c4_206 = arith.constant 4 : index
%c1_207 = arith.constant 1 : index
%c0_208 = arith.constant 0 : index
%c0_209 = arith.constant 0 : index
%c0_210 = arith.constant 0 : index
%c0_211 = arith.constant 0 : index
%c0_212 = arith.constant 0 : index
%c1_213 = arith.constant 1 : index
%c1_214 = arith.constant 1 : index
%c8_215 = arith.constant 8 : index
%c4_216 = arith.constant 4 : index
%c8_217 = arith.constant 8 : index
%c4_218 = arith.constant 4 : index
%c2048_219 = arith.constant 2048 : index
%c1024_220 = arith.constant 1024 : index
%c4_221 = arith.constant 4 : index
%c256_222 = arith.constant 256 : index
%c32_223 = arith.constant 32 : index
%c1_224 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc[%c0_190, %c0_191, %c0_192, %c0_193, %c0_194, %c0_195] [%c1_196, %c1_197, %c8_198, %c4_199, %c8_200, %c4_201] [%c1024_202, %c1024_203, %c128_204, %c32_205, %c4_206, %c1_207], %alloc_1[%c0_208, %arg4, %c0_209, %c0_210, %c0_211, %c0_212] [%c1_213, %c1_214, %c8_215, %c4_216, %c8_217, %c4_218] [%c2048_219, %c1024_220, %c4_221, %c256_222, %c32_223, %c1_224]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x2x32x32xi32, 1 : i32>)
%subview_225 = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// No linalg.fill here: accumulates onto the k=0 partial sums in %alloc_3.
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_226 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_227 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_228 = memref.subview %subview_225[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_226, %subview_227 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_228 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_229: i32, %out: i32):
%5 = arith.muli %in, %in_229 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// Epilogue (k=7, fixed column offset 224 = 7*32): last reduction step,
// fused with the result unpack/writeback below.
%subview_45 = memref.subview %0[%arg0, 224] [64, 32] [1, 1] : memref<128x256xi32> to memref<64x32xi32, strided<[256, 1], offset: ?>>
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%c2_50 = arith.constant 2 : index
%c1_51 = arith.constant 1 : index
%c32_52 = arith.constant 32 : index
%c32_53 = arith.constant 32 : index
%c1024_54 = arith.constant 1024 : index
%c1024_55 = arith.constant 1024 : index
%c32_56 = arith.constant 32 : index
%c1_57 = arith.constant 1 : index
%c0_58 = arith.constant 0 : index
%c0_59 = arith.constant 0 : index
%c224 = arith.constant 224 : index
%c2_60 = arith.constant 2 : index
%c1_61 = arith.constant 1 : index
%c32_62 = arith.constant 32 : index
%c32_63 = arith.constant 32 : index
%c8192_64 = arith.constant 8192 : index
%c32_65 = arith.constant 32 : index
%c256_66 = arith.constant 256 : index
%c1_67 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_2[%c0_46, %c0_47, %c0_48, %c0_49] [%c2_50, %c1_51, %c32_52, %c32_53] [%c1024_54, %c1024_55, %c32_56, %c1_57], %0[%c0_58, %c0_59, %arg0, %c224] [%c2_60, %c1_61, %c32_62, %c32_63] [%c8192_64, %c32_65, %c256_66, %c1_67]) : (memref<2x1x32x32xi32, 1 : i32>, memref<128x256xi32>)
%subview_68 = memref.subview %1[224, %arg1] [32, 64] [1, 1] : memref<256x128xi32> to memref<32x64xi32, strided<[128, 1], offset: ?>>
%c0_69 = arith.constant 0 : index
%c0_70 = arith.constant 0 : index
%c0_71 = arith.constant 0 : index
%c0_72 = arith.constant 0 : index
%c1_73 = arith.constant 1 : index
%c2_74 = arith.constant 2 : index
%c32_75 = arith.constant 32 : index
%c32_76 = arith.constant 32 : index
%c2048_77 = arith.constant 2048 : index
%c1024_78 = arith.constant 1024 : index
%c32_79 = arith.constant 32 : index
%c1_80 = arith.constant 1 : index
%c0_81 = arith.constant 0 : index
%c0_82 = arith.constant 0 : index
%c224_83 = arith.constant 224 : index
%c1_84 = arith.constant 1 : index
%c2_85 = arith.constant 2 : index
%c32_86 = arith.constant 32 : index
%c32_87 = arith.constant 32 : index
%c4096_88 = arith.constant 4096 : index
%c32_89 = arith.constant 32 : index
%c128_90 = arith.constant 128 : index
%c1_91 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_1[%c0_69, %c0_70, %c0_71, %c0_72] [%c1_73, %c2_74, %c32_75, %c32_76] [%c2048_77, %c1024_78, %c32_79, %c1_80], %1[%c0_81, %c0_82, %c224_83, %arg1] [%c1_84, %c2_85, %c32_86, %c32_87] [%c4096_88, %c32_89, %c128_90, %c1_91]) : (memref<1x2x32x32xi32, 1 : i32>, memref<256x128xi32>)
scf.forall (%arg2, %arg3) in (2, 2) {
%subview_107 = memref.subview %alloc_2[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%c0_113 = arith.constant 0 : index
%c1_114 = arith.constant 1 : index
%c1_115 = arith.constant 1 : index
%c4_116 = arith.constant 4 : index
%c8_117 = arith.constant 8 : index
%c4_118 = arith.constant 4 : index
%c8_119 = arith.constant 8 : index
%c1024_120 = arith.constant 1024 : index
%c1024_121 = arith.constant 1024 : index
%c256_122 = arith.constant 256 : index
%c32_123 = arith.constant 32 : index
%c8_124 = arith.constant 8 : index
%c1_125 = arith.constant 1 : index
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%c1_131 = arith.constant 1 : index
%c1_132 = arith.constant 1 : index
%c4_133 = arith.constant 4 : index
%c8_134 = arith.constant 8 : index
%c4_135 = arith.constant 4 : index
%c8_136 = arith.constant 8 : index
%c1024_137 = arith.constant 1024 : index
%c1024_138 = arith.constant 1024 : index
%c8_139 = arith.constant 8 : index
%c128_140 = arith.constant 128 : index
%c32_141 = arith.constant 32 : index
%c1_142 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc_0[%c0_108, %c0_109, %c0_110, %c0_111, %c0_112, %c0_113] [%c1_114, %c1_115, %c4_116, %c8_117, %c4_118, %c8_119] [%c1024_120, %c1024_121, %c256_122, %c32_123, %c8_124, %c1_125], %alloc_2[%arg2, %c0_126, %c0_127, %c0_128, %c0_129, %c0_130] [%c1_131, %c1_132, %c4_133, %c8_134, %c4_135, %c8_136] [%c1024_137, %c1024_138, %c8_139, %c128_140, %c32_141, %c1_142]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<2x1x32x32xi32, 1 : i32>)
%subview_143 = memref.subview %alloc_1[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_144 = arith.constant 0 : index
%c0_145 = arith.constant 0 : index
%c0_146 = arith.constant 0 : index
%c0_147 = arith.constant 0 : index
%c0_148 = arith.constant 0 : index
%c0_149 = arith.constant 0 : index
%c1_150 = arith.constant 1 : index
%c1_151 = arith.constant 1 : index
%c8_152 = arith.constant 8 : index
%c4_153 = arith.constant 4 : index
%c8_154 = arith.constant 8 : index
%c4_155 = arith.constant 4 : index
%c1024_156 = arith.constant 1024 : index
%c1024_157 = arith.constant 1024 : index
%c128_158 = arith.constant 128 : index
%c32_159 = arith.constant 32 : index
%c4_160 = arith.constant 4 : index
%c1_161 = arith.constant 1 : index
%c0_162 = arith.constant 0 : index
%c0_163 = arith.constant 0 : index
%c0_164 = arith.constant 0 : index
%c0_165 = arith.constant 0 : index
%c0_166 = arith.constant 0 : index
%c1_167 = arith.constant 1 : index
%c1_168 = arith.constant 1 : index
%c8_169 = arith.constant 8 : index
%c4_170 = arith.constant 4 : index
%c8_171 = arith.constant 8 : index
%c4_172 = arith.constant 4 : index
%c2048_173 = arith.constant 2048 : index
%c1024_174 = arith.constant 1024 : index
%c4_175 = arith.constant 4 : index
%c256_176 = arith.constant 256 : index
%c32_177 = arith.constant 32 : index
%c1_178 = arith.constant 1 : index
air.dma_memcpy_nd (%alloc[%c0_144, %c0_145, %c0_146, %c0_147, %c0_148, %c0_149] [%c1_150, %c1_151, %c8_152, %c4_153, %c8_154, %c4_155] [%c1024_156, %c1024_157, %c128_158, %c32_159, %c4_160, %c1_161], %alloc_1[%c0_162, %arg3, %c0_163, %c0_164, %c0_165, %c0_166] [%c1_167, %c1_168, %c8_169, %c4_170, %c8_171, %c4_172] [%c2048_173, %c1024_174, %c4_175, %c256_176, %c32_177, %c1_178]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x2x32x32xi32, 1 : i32>)
%subview_179 = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_206 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_207 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_208 = memref.subview %subview_179[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_206, %subview_207 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_208 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_209: i32, %out: i32):
%3 = arith.muli %in, %in_209 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
%subview_180 = memref.subview %alloc_4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>
%c0_181 = arith.constant 0 : index
%c0_182 = arith.constant 0 : index
%c1_183 = arith.constant 1 : index
%c1_184 = arith.constant 1 : index
%c32_185 = arith.constant 32 : index
%c32_186 = arith.constant 32 : index
%c2048_187 = arith.constant 2048 : index
%c1024_188 = arith.constant 1024 : index
%c32_189 = arith.constant 32 : index
%c1_190 = arith.constant 1 : index
%c0_191 = arith.constant 0 : index
%c0_192 = arith.constant 0 : index
%c0_193 = arith.constant 0 : index
%c0_194 = arith.constant 0 : index
%c1_195 = arith.constant 1 : index
%c1_196 = arith.constant 1 : index
%c8_197 = arith.constant 8 : index
%c4_198 = arith.constant 4 : index
%c8_199 = arith.constant 8 : index
%c4_200 = arith.constant 4 : index
%c2048_201 = arith.constant 2048 : index
%c1024_202 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%c4_203 = arith.constant 4 : index
%c128_204 = arith.constant 128 : index
%c1_205 = arith.constant 1 : index
// L1 -> L2 unpack: the finished 8x8x4x4 accumulator tile is linearized
// back into a plain 32x32 tile of %alloc_4 (this replaces the
// iree_linalg_ext.unpack from the pre-pass IR).
air.dma_memcpy_nd (%alloc_4[%arg2, %arg3, %c0_181, %c0_182] [%c1_183, %c1_184, %c32_185, %c32_186] [%c2048_187, %c1024_188, %c32_189, %c1_190], %alloc_3[%arg2, %arg3, %c0_191, %c0_192, %c0_193, %c0_194] [%c1_195, %c1_196, %c8_197, %c4_198, %c8_199, %c4_200] [%c2048_201, %c1024_202, %c16, %c4_203, %c128_204, %c1_205]) : (memref<2x2x32x32xi32, 1 : i32>, memref<2x2x8x8x4x4xi32, 2 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c64 = arith.constant 64 : index
%c64_92 = arith.constant 64 : index
%c128_93 = arith.constant 128 : index
%c1_94 = arith.constant 1 : index
%c0_95 = arith.constant 0 : index
%c0_96 = arith.constant 0 : index
%c0_97 = arith.constant 0 : index
%c0_98 = arith.constant 0 : index
%c2_99 = arith.constant 2 : index
%c32_100 = arith.constant 32 : index
%c2_101 = arith.constant 2 : index
%c32_102 = arith.constant 32 : index
%c2048_103 = arith.constant 2048 : index
%c32_104 = arith.constant 32 : index
%c1024_105 = arith.constant 1024 : index
%c1_106 = arith.constant 1 : index
// L2 -> global writeback: de-tile the 2x2x32x32 slab into the 64x64
// region of the 128x128 result at (%arg0, %arg1).
air.dma_memcpy_nd (%2[%arg0, %arg1] [%c64, %c64_92] [%c128_93, %c1_94], %alloc_4[%c0_95, %c0_96, %c0_97, %c0_98] [%c2_99, %c32_100, %c2_101, %c32_102] [%c2048_103, %c32_104, %c1024_105, %c1_106]) : (memref<128x128xi32>, memref<2x2x32x32xi32, 1 : i32>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Release all staging buffers.
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After CopyToDma (air-copy-to-dma) //----- //
// Dispatch for C[128x128] = A[128x256] * B[256x128] in i32, tiled for an AMD AIE
// target. Each air.dma_memcpy_nd below moves a tile between memory spaces
// (memory-space attributes `1 : i32` and `2 : i32` presumably map to AIE L2 and
// L1 local memories — NOTE(review): confirm against the AIR dialect's space
// numbering). The K dimension (256) is software-pipelined: an explicit prologue
// (first K-tile), a steady-state scf.for over K-tiles 1..6, and an epilogue
// (last K-tile at offset %c224) that also drains results back to global memory.
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Index constants used as DMA offsets/sizes/strides throughout the function.
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c224 = arith.constant 224 : index
%c128 = arith.constant 128 : index
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c256 = arith.constant 256 : index
%c8192 = arith.constant 8192 : index
%c1024 = arith.constant 1024 : index
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Per-core packed operand tiles in memory space 2 (innermost pack layout
// 4x8 / 8x4 per the packing_config below).
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
// Shared 32x32 operand tiles in memory space 1 (refilled every K-step).
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
// Packed accumulator (space 2) and unpacked result staging tile (space 1).
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
// Bindings: %0 = A (read-only), %1 = B (read-only), %2 = C (write).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
memref.assume_alignment %0, 64 : memref<128x256xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
memref.assume_alignment %1, 64 : memref<256x128xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
memref.assume_alignment %2, 64 : memref<128x128xi32>
// Block-level tiling: each (arg0, arg1) iteration produces one 64x64 output
// tile of C (2x2 grid of blocks over the 128x128 result).
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
// Pipeline prologue: fetch the first K-tile (K offset 0) of A and B from
// global memory into the space-1 staging buffers.
air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %0[%c0, %c0, %arg0, %c0] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (memref<2x1x32x32xi32, 1 : i32>, memref<128x256xi32>)
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %c0, %arg1] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (memref<1x2x32x32xi32, 1 : i32>, memref<256x128xi32>)
// Thread-level (2x2 core grid) work for the first K-tile. The fill only
// happens here: it zero-initializes each core's accumulator subview.
scf.forall (%arg2, %arg3) in (2, 2) {
// Repack this core's 32x32 A/B slices from space 1 into the space-2
// packed layouts (the permuted strides implement the pack transpose).
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %alloc_2[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<2x1x32x32xi32, 1 : i32>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %alloc_1[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x2x32x32xi32, 1 : i32>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
// Innermost compute: 8x8x4 loop nest over packed sub-tiles; each body
// iteration is a 4x8 * 8x4 -> 4x4 multiply-accumulate.
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%3 = arith.muli %in, %in_8 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Pipeline steady state: K-tiles 1..6 (256/32 = 8 tiles total; tile 0 was
// the prologue, tile 7 is the epilogue). No fill here — results accumulate
// into %alloc_3.
scf.for %arg2 = %c1 to %c7 step %c1 {
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %0[%c0, %c0, %arg0, %3] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (memref<2x1x32x32xi32, 1 : i32>, memref<128x256xi32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %4, %arg1] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (memref<1x2x32x32xi32, 1 : i32>, memref<256x128xi32>)
scf.forall (%arg3, %arg4) in (2, 2) {
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %alloc_2[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<2x1x32x32xi32, 1 : i32>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %alloc_1[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x2x32x32xi32, 1 : i32>)
%subview = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%5 = arith.muli %in, %in_8 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// Pipeline epilogue: last K-tile at offset 224 (= 7 * 32), then unpack the
// accumulator and write the finished 64x64 block of C back to global memory.
air.dma_memcpy_nd (%alloc_2[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %0[%c0, %c0, %arg0, %c224] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (memref<2x1x32x32xi32, 1 : i32>, memref<128x256xi32>)
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %1[%c0, %c0, %c224, %arg1] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (memref<1x2x32x32xi32, 1 : i32>, memref<256x128xi32>)
scf.forall (%arg2, %arg3) in (2, 2) {
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %alloc_2[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<2x1x32x32xi32, 1 : i32>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %alloc_1[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x2x32x32xi32, 1 : i32>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%3 = arith.muli %in, %in_8 : i32
%4 = arith.addi %out, %3 : i32
linalg.yield %4 : i32
}
}
}
}
// Unpack this core's accumulator tile (8x8x4x4 packed) into the 32x32
// staging layout in space 1 via the permuted-stride DMA.
air.dma_memcpy_nd (%alloc_4[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %alloc_3[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c16, %c4, %c128, %c1]) : (memref<2x2x32x32xi32, 1 : i32>, memref<2x2x8x8x4x4xi32, 2 : i32>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Store the assembled 64x64 block (2x2 of 32x32 tiles) to C at (%arg0, %arg1).
air.dma_memcpy_nd (%2[%arg0, %arg1] [%c64, %c64] [%c128, %c1], %alloc_4[%c0, %c0, %c0, %c0] [%c2, %c32, %c2, %c32] [%c2048, %c32, %c1024, %c1]) : (memref<128x128xi32>, memref<2x2x32x32xi32, 1 : i32>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Release all scratch buffers.
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After AMDAIEAIRDmaToAMDAIEDma (iree-amdaie-air-dma-to-amdaie-dma) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c224 = arith.constant 224 : index
%c128 = arith.constant 128 : index
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c256 = arith.constant 256 : index
%c8192 = arith.constant 8192 : index
%c1024 = arith.constant 1024 : index
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%7 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%8 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%9 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%10 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%11 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%12 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%13 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%19 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%20 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
memref.assume_alignment %21, 64 : memref<128x256xi32>
%25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
memref.assume_alignment %25, 64 : memref<256x128xi32>
%29 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
memref.assume_alignment %29, 64 : memref<128x128xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) {
%31 = amdaie.dma_cpy_nd(%12[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %22[%c0, %c0, %arg0, %c0] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%32 = amdaie.dma_cpy_nd(%6[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %26[%c0, %c0, %c0, %arg1] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%36 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %13[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%37 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %7[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%38 = arith.muli %in, %in_8 : i32
%39 = arith.addi %out, %38 : i32
linalg.yield %39 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
scf.for %arg2 = %c1 to %c7 step %c1 {
%36 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%37 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %23[%c0, %c0, %arg0, %36] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%38 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%39 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %27[%c0, %c0, %38, %arg1] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%40 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %15[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%41 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %9[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%42 = arith.muli %in, %in_8 : i32
%43 = arith.addi %out, %42 : i32
linalg.yield %43 : i32
}
}
}
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%33 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %24[%c0, %c0, %arg0, %c224] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%34 = amdaie.dma_cpy_nd(%10[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %28[%c0, %c0, %c224, %arg1] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%36 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %17[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%37 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %11[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%39 = arith.muli %in, %in_8 : i32
%40 = arith.addi %out, %39 : i32
linalg.yield %40 : i32
}
}
}
}
%38 = amdaie.dma_cpy_nd(%19[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %18[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%35 = amdaie.dma_cpy_nd(%30[%arg0, %arg1] [%c64, %c64] [%c128, %c1], %20[%c0, %c0, %c0, %c0] [%c2, %c32, %c2, %c32] [%c2048, %c32, %c1024, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After AMDAIENormalizeLoopBounds (iree-amdaie-normalize-loop-bounds) //----- //
module {
  // IR after iree-amdaie-normalize-loop-bounds (see the pass banner above).
  // The K (reduction) dimension of the 128x128x256 matmul is processed in
  // 32-wide tiles; the first tile (k = 0, which also zero-initializes the
  // accumulator) and the last tile (k = 7, source offset %c224 = 7 * 32) are
  // peeled out of the loop, leaving a normalized zero-based steady-state loop
  // of 6 iterations covering k = 1 .. 6.
  func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
    // Index constants used as DMA offsets / sizes / strides below.
    %c64 = arith.constant 64 : index
    %c16 = arith.constant 16 : index
    %c224 = arith.constant 224 : index
    %c128 = arith.constant 128 : index
    %c4096 = arith.constant 4096 : index
    %c2048 = arith.constant 2048 : index
    %c256 = arith.constant 256 : index
    %c8192 = arith.constant 8192 : index
    %c1024 = arith.constant 1024 : index
    %c32 = arith.constant 32 : index
    %c2 = arith.constant 2 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %c7 = arith.constant 7 : index
    %c0_i32 = arith.constant 0 : i32
    // Tile-local staging buffers.  NOTE(review): the trailing memory-space
    // attributes (`2 : i32` on the small packed buffers, `1 : i32` on the
    // 32x32 staging buffers) presumably select AIE core-local vs. mem-tile
    // memory — confirm against the AMD-AIE dialect docs.
    // Several logicalobjectfifo.from_memref views are created over the same
    // allocation; distinct SSA values are consumed by the prologue,
    // steady-state, and epilogue DMA copies below.
    %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
    %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
    %1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
    %2 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
    %alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
    %3 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
    %4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
    %5 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
    %alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
    %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
    %7 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
    %8 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
    %9 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
    %10 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
    %11 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
    %alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
    %12 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
    %13 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
    %14 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
    %15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
    %16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
    %17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
    // Per-block accumulator in packed 2x2x8x8x4x4 layout (memory space 2)
    // and its 2x2x32x32 unpacked staging copy (memory space 1).
    %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
    %18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>
    %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
    %19 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
    %20 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
    // Dispatch bindings: A (128x256, read-only), B (256x128, read-only),
    // C (128x128, write).
    %21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
    %22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
    %23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
    %24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
    memref.assume_alignment %21, 64 : memref<128x256xi32>
    %25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
    %26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
    %27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
    %28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
    memref.assume_alignment %25, 64 : memref<256x128xi32>
    %29 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
    %30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
    memref.assume_alignment %29, 64 : memref<128x128xi32>
    // Leftover constants from loop-bound normalization (unused duplicates).
    %c2_5 = arith.constant 2 : index
    %c1_6 = arith.constant 1 : index
    %c2_7 = arith.constant 2 : index
    %c1_8 = arith.constant 1 : index
    // 2x2 block grid: block (%arg0, %arg1) computes the 64x64 output tile at
    // row %32 = %arg0 * 64, column %31 = %arg1 * 64.
    scf.forall (%arg0, %arg1) in (2, 2) {
      %31 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1)
      %32 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0)
      // --- Prologue, k = 0: stage the first 32-wide K slice of A and B. ---
      %33 = amdaie.dma_cpy_nd(%12[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %22[%c0, %c0, %32, %c0] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
      %34 = amdaie.dma_cpy_nd(%6[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %26[%c0, %c0, %c0, %31] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
      // 2x2 core grid: each core packs its A/B sub-tiles into local memory,
      // zero-fills its accumulator slice (only here, at k = 0), and runs the
      // packed 4x8 x 8x4 micro-kernel.
      scf.forall (%arg2, %arg3) in (2, 2) {
        %38 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %13[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
        %39 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %7[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
        %subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
        scf.for %arg4 = %c0 to %c8 step %c1 {
          scf.for %arg5 = %c0 to %c8 step %c1 {
            scf.for %arg6 = %c0 to %c4 step %c1 {
              %subview_10 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
              %subview_11 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
              %subview_12 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
              // Packed matmul micro-kernel: multiply-accumulate of a 4x8 A
              // tile with an 8x4 B tile into a 4x4 C tile.
              linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_10, %subview_11 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_12 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
              ^bb0(%in: i32, %in_13: i32, %out: i32):
                %40 = arith.muli %in, %in_13 : i32
                %41 = arith.addi %out, %40 : i32
                linalg.yield %41 : i32
              }
            }
          }
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      // --- Steady state: normalized k-loop, 6 iterations. ---
      // %arg2 runs 0..5; %38 = %arg2 + 1 recovers the original k index
      // (1..6) and %39/%41 = k * 32 are the source offsets into A/B.
      %c6 = arith.constant 6 : index
      %c0_9 = arith.constant 0 : index
      scf.for %arg2 = %c0_9 to %c6 step %c1 {
        %38 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
        %39 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38)
        %40 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %23[%c0, %c0, %32, %39] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
        %41 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38)
        %42 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %27[%c0, %c0, %41, %31] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
        // Same core grid as the prologue, but without the zero-fill: the
        // accumulator slice is updated in place.
        scf.forall (%arg3, %arg4) in (2, 2) {
          %43 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %15[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
          %44 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %9[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
          %subview = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
          scf.for %arg5 = %c0 to %c8 step %c1 {
            scf.for %arg6 = %c0 to %c8 step %c1 {
              scf.for %arg7 = %c0 to %c4 step %c1 {
                %subview_10 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
                %subview_11 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
                %subview_12 = memref.subview %subview[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
                linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_10, %subview_11 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_12 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
                ^bb0(%in: i32, %in_13: i32, %out: i32):
                  %45 = arith.muli %in, %in_13 : i32
                  %46 = arith.addi %out, %45 : i32
                  linalg.yield %46 : i32
                }
              }
            }
          }
        } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      }
      // --- Epilogue, k = 7: last K slice at source offset %c224 = 7 * 32,
      // followed by the unpack/copy-out of the accumulator. ---
      %35 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %24[%c0, %c0, %32, %c224] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
      %36 = amdaie.dma_cpy_nd(%10[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %28[%c0, %c0, %c224, %31] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
      scf.forall (%arg2, %arg3) in (2, 2) {
        %38 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %17[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
        %39 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %11[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
        %subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
        scf.for %arg4 = %c0 to %c8 step %c1 {
          scf.for %arg5 = %c0 to %c8 step %c1 {
            scf.for %arg6 = %c0 to %c4 step %c1 {
              %subview_10 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
              %subview_11 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
              %subview_12 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
              linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_10, %subview_11 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_12 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
              ^bb0(%in: i32, %in_13: i32, %out: i32):
                %41 = arith.muli %in, %in_13 : i32
                %42 = arith.addi %out, %41 : i32
                linalg.yield %42 : i32
              }
            }
          }
        }
        // Unpack the core's 8x8x4x4 packed accumulator slice into the
        // 2x2x32x32 staging buffer.
        %40 = amdaie.dma_cpy_nd(%19[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %18[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      // Write the block's 64x64 result tile back to the 128x128 output at
      // (%32, %31).
      %37 = amdaie.dma_cpy_nd(%30[%32, %31] [%c64, %c64] [%c128, %c1], %20[%c0, %c0, %c0, %c0] [%c2, %c32, %c2, %c32] [%c2048, %c32, %c1024, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
    memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
    memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
    memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
    memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
    return
  }
}
// -----// IR Dump After AMDAIEInsertCores (iree-amdaie-insert-cores) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c224 = arith.constant 224 : index
%c128 = arith.constant 128 : index
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c256 = arith.constant 256 : index
%c8192 = arith.constant 8192 : index
%c1024 = arith.constant 1024 : index
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%7 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%8 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%9 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%10 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%11 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%12 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%13 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%17 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%19 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%20 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
memref.assume_alignment %21, 64 : memref<128x256xi32>
%25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
memref.assume_alignment %25, 64 : memref<256x128xi32>
%29 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
memref.assume_alignment %29, 64 : memref<128x128xi32>
%c2_5 = arith.constant 2 : index
%c1_6 = arith.constant 1 : index
%c2_7 = arith.constant 2 : index
%c1_8 = arith.constant 1 : index
scf.forall (%arg0, %arg1) in (2, 2) {
%31 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1)
%32 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0)
%33 = amdaie.dma_cpy_nd(%12[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %22[%c0, %c0, %32, %c0] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%34 = amdaie.dma_cpy_nd(%6[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %26[%c0, %c0, %c0, %31] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%38 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %13[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %7[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%c2_10 = arith.constant 2 : index
%40 = arith.addi %arg2, %c2_10 : index
%tile = amdaie.tile(%arg3, %40)
%41 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%39)
linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_11 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_12 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_11, %subview_12 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%42 = arith.muli %in, %in_14 : i32
%43 = arith.addi %out, %42 : i32
linalg.yield %43 : i32
}
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c6 = arith.constant 6 : index
%c0_9 = arith.constant 0 : index
scf.for %arg2 = %c0_9 to %c6 step %c1 {
%38 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%39 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38)
%40 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %23[%c0, %c0, %32, %39] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%41 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38)
%42 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %27[%c0, %c0, %41, %31] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%43 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %15[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%44 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %9[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%c2_10 = arith.constant 2 : index
%45 = arith.addi %arg3, %c2_10 : index
%tile = amdaie.tile(%arg4, %45)
%46 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%43)
amdaie.logicalobjectfifo.consume(%44)
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_11 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_12 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %subview[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_11, %subview_12 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%47 = arith.muli %in, %in_14 : i32
%48 = arith.addi %out, %47 : i32
linalg.yield %48 : i32
}
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%35 = amdaie.dma_cpy_nd(%16[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %24[%c0, %c0, %32, %c224] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%36 = amdaie.dma_cpy_nd(%10[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %28[%c0, %c0, %c224, %31] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg2, %arg3) in (2, 2) {
%38 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %17[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%39 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %11[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%40 = amdaie.dma_cpy_nd(%19[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %18[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>)
%c2_10 = arith.constant 2 : index
%41 = arith.addi %arg2, %c2_10 : index
%tile = amdaie.tile(%arg3, %41)
%42 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%38)
amdaie.logicalobjectfifo.consume(%39)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_11 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_12 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_11, %subview_12 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%43 = arith.muli %in, %in_14 : i32
%44 = arith.addi %out, %43 : i32
linalg.yield %44 : i32
}
}
}
}
amdaie.logicalobjectfifo.produce(%40)
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%37 = amdaie.dma_cpy_nd(%30[%32, %31] [%c64, %c64] [%c128, %c1], %20[%c0, %c0, %c0, %c0] [%c2, %c32, %c2, %c32] [%c2048, %c32, %c1024, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After AMDAIELocalizeLogicalObjectfifo (iree-amdaie-localize-logicalobjectfifo) //----- //
module {
// 128x128 = 128x256 * 256x128 i32 matmul lowered for AMD AIE.
// Output is tiled 64x64 over a 2x2 scf.forall of blocks; each 64x64 block is
// tiled 32x32 over a 2x2 scf.forall of cores. The K dimension (256) is walked
// in eight 32-wide slices: an explicit prologue (slice 0, which also
// zero-initializes the accumulator via linalg.fill), a 6-iteration scf.for
// (slices 1..6), and an explicit epilogue (slice 7 at K offset 224, which also
// copies the result back out).
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
// Index constants used as DMA offsets/sizes/strides below.
// NOTE(review): %c7 is not referenced anywhere in this dump — presumably dead
// after earlier rewrites; confirm before relying on it.
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c224 = arith.constant 224 : index
%c128 = arith.constant 128 : index
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c256 = arith.constant 256 : index
%c8192 = arith.constant 8192 : index
%c1024 = arith.constant 1024 : index
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c7 = arith.constant 7 : index
%c0_i32 = arith.constant 0 : i32
// Per-core packed operand buffers in memory space 2 (presumably AIE core
// local memory — confirm against the target's memory-space convention):
// %alloc  = packed B tile (1x1x8x4x8x4), %alloc_0 = packed A tile (1x1x4x8x4x8).
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
// Memory-space-1 staging buffers (presumably mem-tile/L2): %alloc_1 holds a
// 1x2 row of 32x32 B tiles, %alloc_2 a 2x1 column of 32x32 A tiles.
// Several logicalobjectfifo views are created over the same allocation; they
// are consumed by distinct DMA ops (prologue / loop / epilogue) below.
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
// Packed accumulator for the whole 2x2 core group (memory space 2) and the
// 2x2x32x32 staging buffer (memory space 1) used on the way back out.
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%6 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
// HAL bindings: binding(0) = A (128x256), binding(1) = B (256x128),
// binding(2) = C (128x128, written).
%7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
%9 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
%10 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
memref.assume_alignment %7, 64 : memref<128x256xi32>
%11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
%12 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
%13 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
%14 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
memref.assume_alignment %11, 64 : memref<256x128xi32>
%15 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
%16 = amdaie.logicalobjectfifo.from_memref %15, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
memref.assume_alignment %15, 64 : memref<128x128xi32>
// NOTE(review): %c2_5..%c1_8 are not used in this dump — dead duplicates left
// by the localization pass, expected to fold away under CSE/canonicalize.
%c2_5 = arith.constant 2 : index
%c1_6 = arith.constant 1 : index
%c2_7 = arith.constant 2 : index
%c1_8 = arith.constant 1 : index
// Block-level forall: (%arg0, %arg1) pick the 64x64 output tile; %18/%17 are
// its row/column element offsets (block id * 64).
scf.forall (%arg0, %arg1) in (2, 2) {
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1)
%18 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0)
// --- K slice 0 (prologue): stage A rows [%18, %18+64) x K [0,32) and
// B rows [0,32) x cols [%17, %17+64) into the space-1 buffers.
%19 = amdaie.dma_cpy_nd(%3[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %8[%c0, %c0, %18, %c0] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%20 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %6[%c0, %c0, %c0, %17] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
%21 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%22 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%23 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%24 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
// Core-level forall for slice 0: each core (%arg2,%arg3) DMAs its packed A/B
// tile into space-2 memory, zero-fills its accumulator subview, then runs the
// packed 9-D matmul (linalg.generic with 3 reduction dims).
scf.forall (%arg2, %arg3) in (2, 2) {
%34 = amdaie.dma_cpy_nd(%22[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %24[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%35 = amdaie.dma_cpy_nd(%21[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %23[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
// This core's 1x1 slice of the shared packed accumulator.
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Core placement: row = %arg2 + 2 (compute rows presumably start at row 2
// of the AIE array — confirm against the target layout), column = %arg3.
%c2_10 = arith.constant 2 : index
%36 = arith.addi %arg2, %c2_10 : index
%tile = amdaie.tile(%arg3, %36)
%37 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%34)
amdaie.logicalobjectfifo.consume(%35)
// Slice 0 only: initialize the accumulator to 0 before the first MAC.
linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_11 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_12 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// 4x8 (A) * 8x4 (B) -> 4x4 (C) multiply-accumulate on i32.
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_11, %subview_12 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%38 = arith.muli %in, %in_14 : i32
%39 = arith.addi %out, %38 : i32
linalg.yield %39 : i32
}
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// --- K slices 1..6: same staging + per-core compute, with the K offset
// computed as (%arg2 + 1) * 32. No fill here — accumulation continues.
%c6 = arith.constant 6 : index
%c0_9 = arith.constant 0 : index
scf.for %arg2 = %c0_9 to %c6 step %c1 {
%34 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%35 = affine.apply affine_map<(d0) -> (d0 * 32)>(%34)
%36 = amdaie.dma_cpy_nd(%4[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %9[%c0, %c0, %18, %35] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%37 = affine.apply affine_map<(d0) -> (d0 * 32)>(%34)
%38 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %13[%c0, %c0, %37, %17] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
%39 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%40 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%41 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%42 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
scf.forall (%arg3, %arg4) in (2, 2) {
%43 = amdaie.dma_cpy_nd(%40[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %42[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%44 = amdaie.dma_cpy_nd(%39[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %41[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%c2_10 = arith.constant 2 : index
%45 = arith.addi %arg3, %c2_10 : index
%tile = amdaie.tile(%arg4, %45)
%46 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%43)
amdaie.logicalobjectfifo.consume(%44)
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_11 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_12 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %subview[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_11, %subview_12 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%47 = arith.muli %in, %in_14 : i32
%48 = arith.addi %out, %47 : i32
linalg.yield %48 : i32
}
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
// --- K slice 7 (epilogue): stage at fixed K offset 224 (= 7 * 32), run the
// final accumulation, then unpack/copy the result back toward the output.
%25 = amdaie.dma_cpy_nd(%5[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %10[%c0, %c0, %18, %c224] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%26 = amdaie.dma_cpy_nd(%2[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %14[%c0, %c0, %c224, %17] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
%27 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%28 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%29 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%30 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%31 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>
%32 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%34 = amdaie.dma_cpy_nd(%28[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %30[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%35 = amdaie.dma_cpy_nd(%27[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %29[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Unpack this core's 8x8x4x4 accumulator into a flat 32x32 tile of the
// space-1 staging buffer; produced below once the compute finishes.
%36 = amdaie.dma_cpy_nd(%32[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %31[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>)
%c2_10 = arith.constant 2 : index
%37 = arith.addi %arg2, %c2_10 : index
%tile = amdaie.tile(%arg3, %37)
%38 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%34)
amdaie.logicalobjectfifo.consume(%35)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_11 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_12 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_13 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_11, %subview_12 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_13 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%39 = arith.muli %in, %in_14 : i32
%40 = arith.addi %out, %39 : i32
linalg.yield %40 : i32
}
}
}
}
// Signal the outbound objectfifo copy (%36) after the last accumulation.
amdaie.logicalobjectfifo.produce(%36)
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Write the block's 64x64 result from the 2x2x32x32 staging buffer into the
// output at element offset (%18, %17).
%33 = amdaie.dma_cpy_nd(%16[%18, %17] [%c64, %c64] [%c128, %c1], %6[%c0, %c0, %c0, %c0] [%c2, %c32, %c2, %c32] [%c2048, %c32, %c1024, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @matmul_i32_dispatch_0_matmul_128x128x256_i32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%c224 = arith.constant 224 : index
%c128 = arith.constant 128 : index
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c256 = arith.constant 256 : index
%c8192 = arith.constant 8192 : index
%c1024 = arith.constant 1024 : index
%c32 = arith.constant 32 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%alloc_1 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%alloc_2 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xi32, 2 : i32>
%alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<128x256xi32>
%4 = amdaie.logicalobjectfifo.from_memref %3, {} : memref<128x256xi32> -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
memref.assume_alignment %3, 64 : memref<128x256xi32>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x128xi32>
%6 = amdaie.logicalobjectfifo.from_memref %5, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
memref.assume_alignment %5, 64 : memref<256x128xi32>
%7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<128x128xi32>
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
memref.assume_alignment %7, 64 : memref<128x128xi32>
scf.forall (%arg0, %arg1) in (2, 2) {
%9 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1)
%10 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0)
%11 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %4[%c0, %c0, %10, %c0] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%12 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %6[%c0, %c0, %c0, %9] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
%13 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%14 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%19 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %1[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%20 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %0[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%21 = arith.addi %arg2, %c2 : index
%tile = amdaie.tile(%arg3, %21)
%22 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%20)
linalg.fill ins(%c0_i32 : i32) outs(%subview : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%23 = arith.muli %in, %in_8 : i32
%24 = arith.addi %out, %23 : i32
linalg.yield %24 : i32
}
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%c6 = arith.constant 6 : index
scf.for %arg2 = %c0 to %c6 step %c1 {
%19 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
%20 = affine.apply affine_map<(d0) -> (d0 * 32)>(%19)
%21 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %4[%c0, %c0, %10, %20] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%22 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %6[%c0, %c0, %20, %9] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
scf.forall (%arg3, %arg4) in (2, 2) {
%23 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %1[%arg3, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%24 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %0[%c0, %arg4, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%25 = arith.addi %arg3, %c2 : index
%tile = amdaie.tile(%arg4, %25)
%26 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%23)
amdaie.logicalobjectfifo.consume(%24)
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c8 step %c1 {
scf.for %arg7 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%27 = arith.muli %in, %in_8 : i32
%28 = arith.addi %out, %27 : i32
linalg.yield %28 : i32
}
}
}
}
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
}
%15 = amdaie.dma_cpy_nd(%1[%c0, %c0, %c0, %c0] [%c2, %c1, %c32, %c32] [%c1024, %c1024, %c32, %c1], %4[%c0, %c0, %10, %c224] [%c2, %c1, %c32, %c32] [%c8192, %c32, %c256, %c1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xi32>>)
%16 = amdaie.dma_cpy_nd(%0[%c0, %c0, %c0, %c0] [%c1, %c2, %c32, %c32] [%c2048, %c1024, %c32, %c1], %6[%c0, %c0, %c224, %9] [%c1, %c2, %c32, %c32] [%c4096, %c32, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
%17 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<2x2x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>
scf.forall (%arg2, %arg3) in (2, 2) {
%19 = amdaie.dma_cpy_nd(%14[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c256, %c32, %c8, %c1], %1[%arg2, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c8, %c4, %c8] [%c1024, %c1024, %c8, %c128, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
%20 = amdaie.dma_cpy_nd(%13[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c128, %c32, %c4, %c1], %0[%c0, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c4, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
%subview = memref.subview %alloc_3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xi32, 2 : i32> to memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%21 = amdaie.dma_cpy_nd(%2[%arg2, %arg3, %c0, %c0] [%c1, %c1, %c32, %c32] [%c2048, %c1024, %c32, %c1], %17[%arg2, %arg3, %c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c4, %c8, %c4] [%c2048, %c1024, %c16, %c4, %c128, %c1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xi32, 2 : i32>>)
%22 = arith.addi %arg2, %c2 : index
%tile = amdaie.tile(%arg3, %22)
%23 = amdaie.core(%tile) {
amdaie.logicalobjectfifo.consume(%19)
amdaie.logicalobjectfifo.consume(%20)
scf.for %arg4 = %c0 to %c8 step %c1 {
scf.for %arg5 = %c0 to %c8 step %c1 {
scf.for %arg6 = %c0 to %c4 step %c1 {
%subview_5 = memref.subview %alloc_0[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xi32, 2 : i32> to memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%subview_6 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xi32, 2 : i32> to memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%subview_7 = memref.subview %subview[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%subview_5, %subview_6 : memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>) outs(%subview_7 : memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%24 = arith.muli %in, %in_8 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
}
}
}
}
amdaie.logicalobjectfifo.produce(%21)
amdaie.end
}
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
%18 = amdaie.dma_cpy_nd(%8[%10, %9] [%c64, %c64] [%c128, %c1], %2[%c0, %c0, %c0, %c0] [%c2, %c32, %c2, %c32] [%c2048, %c32, %c1024, %c1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xi32, 2 : i32>
memref.dealloc %alloc_2 : memref<2x1x32x32xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x2x32x32xi32, 1 : i32>
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
return
}
}
<stdin>:20:16: error: expected result type to be 'memref<1x1x1x1x4x4xi32, strided<[1024, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>' or a rank-reduced version. (mismatch of result layout)
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) -> tensor<128x128xi32>
^
<stdin>:20:16: note: see current operation: %347 = "memref.subview"(%29, %arg76, %arg75) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// -----// IR Dump After AMDAIEDistributeCoresAndObjectFifos Failed (iree-amdaie-distribute-cores-and-objectfifos) //----- //
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32"}> ({
%0 = "arith.constant"() <{value = 3 : index}> : () -> index
%1 = "arith.constant"() <{value = 6 : index}> : () -> index
%2 = "arith.constant"() <{value = 64 : index}> : () -> index
%3 = "arith.constant"() <{value = 16 : index}> : () -> index
%4 = "arith.constant"() <{value = 224 : index}> : () -> index
%5 = "arith.constant"() <{value = 128 : index}> : () -> index
%6 = "arith.constant"() <{value = 4096 : index}> : () -> index
%7 = "arith.constant"() <{value = 2048 : index}> : () -> index
%8 = "arith.constant"() <{value = 256 : index}> : () -> index
%9 = "arith.constant"() <{value = 8192 : index}> : () -> index
%10 = "arith.constant"() <{value = 1024 : index}> : () -> index
%11 = "arith.constant"() <{value = 32 : index}> : () -> index
%12 = "arith.constant"() <{value = 2 : index}> : () -> index
%13 = "arith.constant"() <{value = 4 : index}> : () -> index
%14 = "arith.constant"() <{value = 8 : index}> : () -> index
%15 = "arith.constant"() <{value = 1 : index}> : () -> index
%16 = "arith.constant"() <{value = 0 : index}> : () -> index
%17 = "arith.constant"() <{value = 0 : i32}> : () -> i32
%18 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4x8x4xi32, 2 : i32>
%19 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x4x8x4x8xi32, 2 : i32>
%20 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x2x32x32xi32, 1 : i32>
%21 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x2x32x32xi32, 1 : i32>
%22 = "amdaie.tile"(%16, %15) : (index, index) -> index
%23 = "amdaie.tile"(%15, %15) : (index, index) -> index
%24 = "amdaie.logicalobjectfifo.from_memref"(%20, %22) : (memref<1x2x32x32xi32, 1 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%25 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<2x1x32x32xi32, 1 : i32>
%26 = "amdaie.tile"(%16, %15) : (index, index) -> index
%27 = "amdaie.tile"(%15, %15) : (index, index) -> index
%28 = "amdaie.logicalobjectfifo.from_memref"(%20, %26) : (memref<1x2x32x32xi32, 1 : i32>, index) -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%29 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x8x4x4xi32, 2 : i32>
%30 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<2x2x32x32xi32, 1 : i32>
%31 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<2x2x32x32xi32, 1 : i32>
%32 = "amdaie.tile"(%16, %15) : (index, index) -> index
%33 = "amdaie.tile"(%15, %15) : (index, index) -> index
%34 = "amdaie.logicalobjectfifo.from_memref"(%30, %32) : (memref<2x2x32x32xi32, 1 : i32>, index) -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%35 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<128x256xi32>
%36 = "amdaie.tile"(%16, %16) : (index, index) -> index
%37 = "amdaie.tile"(%15, %16) : (index, index) -> index
%38 = "amdaie.logicalobjectfifo.from_memref"(%35, %36) : (memref<128x256xi32>, index) -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
"memref.assume_alignment"(%35) <{alignment = 64 : i32}> : (memref<128x256xi32>) -> ()
%39 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<256x128xi32>
%40 = "amdaie.tile"(%16, %16) : (index, index) -> index
%41 = "amdaie.tile"(%15, %16) : (index, index) -> index
%42 = "amdaie.logicalobjectfifo.from_memref"(%39, %40) : (memref<256x128xi32>, index) -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
"memref.assume_alignment"(%39) <{alignment = 64 : i32}> : (memref<256x128xi32>) -> ()
%43 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<128x128xi32>
%44 = "amdaie.tile"(%16, %16) : (index, index) -> index
%45 = "amdaie.tile"(%15, %16) : (index, index) -> index
%46 = "amdaie.logicalobjectfifo.from_memref"(%43, %44) : (memref<128x128xi32>, index) -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
"memref.assume_alignment"(%43) <{alignment = 64 : i32}> : (memref<128x128xi32>) -> ()
"scf.forall"() <{mapping = [#gpu.block<y>, #gpu.block<x>], operandSegmentSizes = array<i32: 0, 0, 0, 0>, staticLowerBound = array<i64: 0, 0>, staticStep = array<i64: 1, 1>, staticUpperBound = array<i64: 2, 2>}> ({
^bb0(%arg0: index, %arg1: index):
%47 = "affine.apply"(%arg1) <{map = affine_map<(d0) -> (d0 * 64)>}> : (index) -> index
%48 = "affine.apply"(%arg0) <{map = affine_map<(d0) -> (d0 * 64)>}> : (index) -> index
%49 = "amdaie.dma_cpy_nd"(%28, %16, %16, %16, %16, %12, %15, %11, %11, %10, %10, %11, %15, %38, %16, %16, %48, %16, %12, %15, %11, %11, %9, %11, %8, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<128x256xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%50 = "amdaie.dma_cpy_nd"(%24, %16, %16, %16, %16, %15, %12, %11, %11, %7, %10, %11, %15, %42, %16, %16, %16, %47, %15, %12, %11, %11, %6, %11, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<256x128xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%51 = "amdaie.tile"(%15, %0) : (index, index) -> index
%52 = "amdaie.tile"(%16, %0) : (index, index) -> index
%53 = "amdaie.tile"(%15, %12) : (index, index) -> index
%54 = "amdaie.tile"(%16, %12) : (index, index) -> index
%55 = "amdaie.logicalobjectfifo.from_memref"(%18, %54, %52, %53, %51) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%56 = "amdaie.tile"(%15, %0) : (index, index) -> index
%57 = "amdaie.tile"(%16, %0) : (index, index) -> index
%58 = "amdaie.tile"(%15, %12) : (index, index) -> index
%59 = "amdaie.tile"(%16, %12) : (index, index) -> index
%60 = "amdaie.logicalobjectfifo.from_memref"(%18, %59, %57, %58, %56) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%61 = "amdaie.tile"(%15, %0) : (index, index) -> index
%62 = "amdaie.tile"(%16, %0) : (index, index) -> index
%63 = "amdaie.tile"(%15, %12) : (index, index) -> index
%64 = "amdaie.tile"(%16, %12) : (index, index) -> index
%65 = "amdaie.logicalobjectfifo.from_memref"(%18, %64, %62, %63, %61) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%66 = "amdaie.tile"(%15, %0) : (index, index) -> index
%67 = "amdaie.tile"(%16, %0) : (index, index) -> index
%68 = "amdaie.tile"(%15, %12) : (index, index) -> index
%69 = "amdaie.tile"(%16, %12) : (index, index) -> index
%70 = "amdaie.logicalobjectfifo.from_memref"(%18, %69, %67, %68, %66) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%71 = "amdaie.tile"(%15, %0) : (index, index) -> index
%72 = "amdaie.tile"(%16, %0) : (index, index) -> index
%73 = "amdaie.tile"(%15, %12) : (index, index) -> index
%74 = "amdaie.tile"(%16, %12) : (index, index) -> index
%75 = "amdaie.logicalobjectfifo.from_memref"(%19, %74, %72, %73, %71) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%76 = "amdaie.tile"(%15, %0) : (index, index) -> index
%77 = "amdaie.tile"(%16, %0) : (index, index) -> index
%78 = "amdaie.tile"(%15, %12) : (index, index) -> index
%79 = "amdaie.tile"(%16, %12) : (index, index) -> index
%80 = "amdaie.logicalobjectfifo.from_memref"(%19, %79, %77, %78, %76) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%81 = "amdaie.tile"(%15, %0) : (index, index) -> index
%82 = "amdaie.tile"(%16, %0) : (index, index) -> index
%83 = "amdaie.tile"(%15, %12) : (index, index) -> index
%84 = "amdaie.tile"(%16, %12) : (index, index) -> index
%85 = "amdaie.logicalobjectfifo.from_memref"(%19, %84, %82, %83, %81) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%86 = "amdaie.tile"(%15, %0) : (index, index) -> index
%87 = "amdaie.tile"(%16, %0) : (index, index) -> index
%88 = "amdaie.tile"(%15, %12) : (index, index) -> index
%89 = "amdaie.tile"(%16, %12) : (index, index) -> index
%90 = "amdaie.logicalobjectfifo.from_memref"(%19, %89, %87, %88, %86) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%91 = "amdaie.dma_cpy_nd"(%70, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%92 = "amdaie.dma_cpy_nd"(%65, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %15, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%93 = "amdaie.dma_cpy_nd"(%90, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%94 = "amdaie.tile"(%16, %12) : (index, index) -> index
%95 = "amdaie.tile"(%16, %12) : (index, index) -> index
%96 = "amdaie.tile"(%15, %0) : (index, index) -> index
%97 = "amdaie.tile"(%16, %0) : (index, index) -> index
%98 = "amdaie.tile"(%15, %12) : (index, index) -> index
%99 = "amdaie.logicalobjectfifo.from_memref"(%29, %95, %97, %98, %96) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%100 = "amdaie.tile"(%16, %12) : (index, index) -> index
%101 = "amdaie.logicalobjectfifo.from_memref"(%345, %100) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%102 = "amdaie.tile"(%16, %12) : (index, index) -> index
%103 = "amdaie.logicalobjectfifo.from_memref"(%346, %102) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%104 = "amdaie.tile"(%16, %12) : (index, index) -> index
%105 = "amdaie.logicalobjectfifo.from_memref"(%347, %104) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%106 = "amdaie.core"(%94) ({
%341 = "amdaie.logicalobjectfifo.access"(%105) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%342 = "amdaie.logicalobjectfifo.access"(%103) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%343 = "amdaie.logicalobjectfifo.access"(%101) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%344 = "amdaie.logicalobjectfifo.access"(%99) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) -> memref<1x1x8x8x4x4xi32, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%93) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%91) : (index) -> ()
"linalg.fill"(%17, %344) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg81: i32, %arg82: i32):
"linalg.yield"(%arg81) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg75: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg76: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg77: index):
%345 = "memref.subview"(%19, %arg77, %arg75) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%346 = "memref.subview"(%18, %arg76, %arg77) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%347 = "memref.subview"(%29, %arg76, %arg75) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%343, %342, %341) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg78: i32, %arg79: i32, %arg80: i32):
%348 = "arith.muli"(%arg78, %arg79) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%349 = "arith.addi"(%arg80, %348) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%349) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
// --- Tile handles and logical objectfifos feeding the amdaie.core (%119) below ---
// NOTE(review): several "amdaie.tile" ops here carry identical operands
// (e.g. %107/%108/%113/%115/%117 all take (%15, %12)); presumably a later
// uniquing/CSE step deduplicates them -- confirm against the full pass log.
%107 = "amdaie.tile"(%15, %12) : (index, index) -> index
%108 = "amdaie.tile"(%15, %12) : (index, index) -> index
%109 = "amdaie.tile"(%15, %0) : (index, index) -> index
%110 = "amdaie.tile"(%16, %0) : (index, index) -> index
%111 = "amdaie.tile"(%16, %12) : (index, index) -> index
// Accumulator objectfifo: one local buffer (%29) registered against four tiles.
%112 = "amdaie.logicalobjectfifo.from_memref"(%29, %111, %110, %108, %109) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%113 = "amdaie.tile"(%15, %12) : (index, index) -> index
// NOTE(review): %336, %337 and %338 are memref.subview results that are
// textually defined *inside* the amdaie.core region further down, so these
// uses precede their defs (SSA dominance is violated in this dump). This
// looks like in-flight IR from a subview hoisting/folding step -- verify.
%114 = "amdaie.logicalobjectfifo.from_memref"(%336, %113) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%115 = "amdaie.tile"(%15, %12) : (index, index) -> index
%116 = "amdaie.logicalobjectfifo.from_memref"(%337, %115) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%117 = "amdaie.tile"(%15, %12) : (index, index) -> index
%118 = "amdaie.logicalobjectfifo.from_memref"(%338, %117) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
// Core program bound to tile %107: acquire the three packed operand views,
// zero-fill the local accumulator, then run the inner packed matmul
// (expressed as a 9-d linalg.generic: mul + accumulate) inside a 3-deep
// scf.for nest over the packed tile coordinates.
%119 = "amdaie.core"(%107) ({
// Read-access views (access_type = 0) of the objectfifos declared above.
%332 = "amdaie.logicalobjectfifo.access"(%118) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%333 = "amdaie.logicalobjectfifo.access"(%116) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%334 = "amdaie.logicalobjectfifo.access"(%114) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%335 = "amdaie.logicalobjectfifo.access"(%112) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) -> memref<1x1x8x8x4x4xi32, 2 : i32>
// Wait on the two incoming DMA transfers before computing.
"amdaie.logicalobjectfifo.consume"(%93) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%92) : (index) -> ()
// Initialize the full local accumulator %335 with the scalar %17.
"linalg.fill"(%17, %335) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg73: i32, %arg74: i32):
"linalg.yield"(%arg73) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg67: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg68: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg69: index):
// Per-iteration single-tile subviews of the packed L1 buffers
// (%19 = LHS 4x8x4x8 pack, %18 = RHS 8x4x8x4 pack, %29 = accumulator).
// NOTE(review): these subview results are consumed by the from_memref
// ops *above* this region, while the linalg.generic below reads the
// access results %334/%333/%332 instead -- consistent with a dump taken
// mid-way through a subview-folding transformation; verify intent.
%336 = "memref.subview"(%19, %arg69, %arg67) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%337 = "memref.subview"(%18, %arg68, %arg69) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%338 = "memref.subview"(%29, %arg68, %arg67) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Inner micro-kernel: out[...] += lhs[...] * rhs[...] (i32).
"linalg.generic"(%334, %333, %332) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg70: i32, %arg71: i32, %arg72: i32):
%339 = "arith.muli"(%arg70, %arg71) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%340 = "arith.addi"(%arg72, %339) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%340) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
// DMA transfer %120: copy from the L2 objectfifo %28 (2x1x32x32) into the
// packed L1 objectfifo %85 (1x1x4x8x4x8). All static offset/size/stride
// entries are kDynamic sentinels (-9223372036854775808), i.e. the full
// addressing comes from the dynamic index operands.
%120 = "amdaie.dma_cpy_nd"(%85, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %15, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
// --- Tile handles and logical objectfifos feeding the amdaie.core (%133) below ---
%121 = "amdaie.tile"(%16, %0) : (index, index) -> index
%122 = "amdaie.tile"(%16, %0) : (index, index) -> index
%123 = "amdaie.tile"(%15, %0) : (index, index) -> index
%124 = "amdaie.tile"(%15, %12) : (index, index) -> index
%125 = "amdaie.tile"(%16, %12) : (index, index) -> index
%126 = "amdaie.logicalobjectfifo.from_memref"(%29, %125, %122, %124, %123) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%127 = "amdaie.tile"(%16, %0) : (index, index) -> index
// NOTE(review): %327/%328/%329 are used here but textually defined inside
// the amdaie.core region below (use-before-def in this dump) -- appears to
// be mid-pass IR from a subview hoisting/folding step; verify.
%128 = "amdaie.logicalobjectfifo.from_memref"(%327, %127) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%129 = "amdaie.tile"(%16, %0) : (index, index) -> index
%130 = "amdaie.logicalobjectfifo.from_memref"(%328, %129) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%131 = "amdaie.tile"(%16, %0) : (index, index) -> index
%132 = "amdaie.logicalobjectfifo.from_memref"(%329, %131) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
// Core program bound to tile %121: same structure as core %119 above --
// acquire operand views, consume the incoming DMAs (%120, %91), zero-fill
// the local accumulator, then run the packed matmul micro-kernel in a
// 3-deep scf.for nest.
%133 = "amdaie.core"(%121) ({
// Read-access views (access_type = 0) of the objectfifos declared above.
%323 = "amdaie.logicalobjectfifo.access"(%132) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%324 = "amdaie.logicalobjectfifo.access"(%130) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%325 = "amdaie.logicalobjectfifo.access"(%128) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%326 = "amdaie.logicalobjectfifo.access"(%126) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) -> memref<1x1x8x8x4x4xi32, 2 : i32>
// Wait on the two incoming DMA transfers before computing.
"amdaie.logicalobjectfifo.consume"(%120) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%91) : (index) -> ()
// Initialize the full local accumulator %326 with the scalar %17.
"linalg.fill"(%17, %326) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg65: i32, %arg66: i32):
"linalg.yield"(%arg65) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg59: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg60: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg61: index):
// Per-iteration single-tile subviews; consumed only by the
// from_memref ops above this region (see the review note there).
%327 = "memref.subview"(%19, %arg61, %arg59) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%328 = "memref.subview"(%18, %arg60, %arg61) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%329 = "memref.subview"(%29, %arg60, %arg59) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Inner micro-kernel: out[...] += lhs[...] * rhs[...] (i32).
"linalg.generic"(%325, %324, %323) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg62: i32, %arg63: i32, %arg64: i32):
%330 = "arith.muli"(%arg62, %arg63) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%331 = "arith.addi"(%arg64, %330) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%331) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
// --- Tile handles and logical objectfifos feeding the amdaie.core (%146) below ---
%134 = "amdaie.tile"(%15, %0) : (index, index) -> index
%135 = "amdaie.tile"(%15, %0) : (index, index) -> index
%136 = "amdaie.tile"(%16, %0) : (index, index) -> index
%137 = "amdaie.tile"(%15, %12) : (index, index) -> index
%138 = "amdaie.tile"(%16, %12) : (index, index) -> index
// Accumulator objectfifo: the same local buffer (%29) registered against four tiles.
%139 = "amdaie.logicalobjectfifo.from_memref"(%29, %138, %136, %137, %135) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%140 = "amdaie.tile"(%15, %0) : (index, index) -> index
// NOTE(review): %318/%319/%320 are used here but textually defined inside
// the amdaie.core region below (use-before-def in this dump) -- appears to
// be mid-pass IR from a subview hoisting/folding step; verify.
%141 = "amdaie.logicalobjectfifo.from_memref"(%318, %140) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%142 = "amdaie.tile"(%15, %0) : (index, index) -> index
%143 = "amdaie.logicalobjectfifo.from_memref"(%319, %142) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%144 = "amdaie.tile"(%15, %0) : (index, index) -> index
%145 = "amdaie.logicalobjectfifo.from_memref"(%320, %144) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
// Core program bound to tile %134: same structure as cores %119/%133 --
// acquire operand views, consume the incoming DMAs (%120, %92), zero-fill
// the local accumulator, then run the packed matmul micro-kernel in a
// 3-deep scf.for nest.
%146 = "amdaie.core"(%134) ({
// Read-access views (access_type = 0) of the objectfifos declared above.
%314 = "amdaie.logicalobjectfifo.access"(%145) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%315 = "amdaie.logicalobjectfifo.access"(%143) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%316 = "amdaie.logicalobjectfifo.access"(%141) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%317 = "amdaie.logicalobjectfifo.access"(%139) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) -> memref<1x1x8x8x4x4xi32, 2 : i32>
// Wait on the two incoming DMA transfers before computing.
"amdaie.logicalobjectfifo.consume"(%120) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%92) : (index) -> ()
// Initialize the full local accumulator %317 with the scalar %17.
"linalg.fill"(%17, %317) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg57: i32, %arg58: i32):
"linalg.yield"(%arg57) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg51: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg52: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg53: index):
// Per-iteration single-tile subviews; consumed only by the
// from_memref ops above this region (see the review note there).
%318 = "memref.subview"(%19, %arg53, %arg51) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%319 = "memref.subview"(%18, %arg52, %arg53) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%320 = "memref.subview"(%29, %arg52, %arg51) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
// Inner micro-kernel: out[...] += lhs[...] * rhs[...] (i32).
"linalg.generic"(%316, %315, %314) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg54: i32, %arg55: i32, %arg56: i32):
%321 = "arith.muli"(%arg54, %arg55) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%322 = "arith.addi"(%arg56, %321) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%322) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
"scf.for"(%16, %1, %15) ({
^bb0(%arg26: index):
%242 = "affine.apply"(%arg26) <{map = affine_map<(d0) -> (d0 + 1)>}> : (index) -> index
%243 = "affine.apply"(%242) <{map = affine_map<(d0) -> (d0 * 32)>}> : (index) -> index
%244 = "amdaie.dma_cpy_nd"(%28, %16, %16, %16, %16, %12, %15, %11, %11, %10, %10, %11, %15, %38, %16, %16, %48, %243, %12, %15, %11, %11, %9, %11, %8, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<128x256xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%245 = "amdaie.dma_cpy_nd"(%24, %16, %16, %16, %16, %15, %12, %11, %11, %7, %10, %11, %15, %42, %16, %16, %243, %47, %15, %12, %11, %11, %6, %11, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<256x128xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%246 = "amdaie.dma_cpy_nd"(%70, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%247 = "amdaie.dma_cpy_nd"(%60, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %15, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%248 = "amdaie.dma_cpy_nd"(%90, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%249 = "amdaie.tile"(%16, %12) : (index, index) -> index
%250 = "amdaie.tile"(%16, %12) : (index, index) -> index
%251 = "amdaie.logicalobjectfifo.from_memref"(%309, %250) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%252 = "amdaie.tile"(%16, %12) : (index, index) -> index
%253 = "amdaie.logicalobjectfifo.from_memref"(%310, %252) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%254 = "amdaie.tile"(%16, %12) : (index, index) -> index
%255 = "amdaie.logicalobjectfifo.from_memref"(%311, %254) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%256 = "amdaie.core"(%249) ({
%306 = "amdaie.logicalobjectfifo.access"(%255) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%307 = "amdaie.logicalobjectfifo.access"(%253) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%308 = "amdaie.logicalobjectfifo.access"(%251) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%248) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%246) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg45: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg46: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg47: index):
%309 = "memref.subview"(%19, %arg47, %arg45) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%310 = "memref.subview"(%18, %arg46, %arg47) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%311 = "memref.subview"(%29, %arg46, %arg45) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%308, %307, %306) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg48: i32, %arg49: i32, %arg50: i32):
%312 = "arith.muli"(%arg48, %arg49) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%313 = "arith.addi"(%arg50, %312) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%313) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%257 = "amdaie.tile"(%15, %12) : (index, index) -> index
%258 = "amdaie.tile"(%15, %12) : (index, index) -> index
%259 = "amdaie.logicalobjectfifo.from_memref"(%301, %258) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%260 = "amdaie.tile"(%15, %12) : (index, index) -> index
%261 = "amdaie.logicalobjectfifo.from_memref"(%302, %260) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%262 = "amdaie.tile"(%15, %12) : (index, index) -> index
%263 = "amdaie.logicalobjectfifo.from_memref"(%303, %262) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%264 = "amdaie.core"(%257) ({
%298 = "amdaie.logicalobjectfifo.access"(%263) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%299 = "amdaie.logicalobjectfifo.access"(%261) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%300 = "amdaie.logicalobjectfifo.access"(%259) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%248) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%247) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg39: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg40: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg41: index):
%301 = "memref.subview"(%19, %arg41, %arg39) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%302 = "memref.subview"(%18, %arg40, %arg41) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%303 = "memref.subview"(%29, %arg40, %arg39) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%300, %299, %298) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg42: i32, %arg43: i32, %arg44: i32):
%304 = "arith.muli"(%arg42, %arg43) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%305 = "arith.addi"(%arg44, %304) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%305) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%265 = "amdaie.dma_cpy_nd"(%80, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %15, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%266 = "amdaie.tile"(%16, %0) : (index, index) -> index
%267 = "amdaie.tile"(%16, %0) : (index, index) -> index
%268 = "amdaie.logicalobjectfifo.from_memref"(%293, %267) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%269 = "amdaie.tile"(%16, %0) : (index, index) -> index
%270 = "amdaie.logicalobjectfifo.from_memref"(%294, %269) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%271 = "amdaie.tile"(%16, %0) : (index, index) -> index
%272 = "amdaie.logicalobjectfifo.from_memref"(%295, %271) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%273 = "amdaie.core"(%266) ({
%290 = "amdaie.logicalobjectfifo.access"(%272) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%291 = "amdaie.logicalobjectfifo.access"(%270) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%292 = "amdaie.logicalobjectfifo.access"(%268) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%265) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%246) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg33: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg34: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg35: index):
%293 = "memref.subview"(%19, %arg35, %arg33) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%294 = "memref.subview"(%18, %arg34, %arg35) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%295 = "memref.subview"(%29, %arg34, %arg33) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%292, %291, %290) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg36: i32, %arg37: i32, %arg38: i32):
%296 = "arith.muli"(%arg36, %arg37) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%297 = "arith.addi"(%arg38, %296) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%297) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%274 = "amdaie.tile"(%15, %0) : (index, index) -> index
%275 = "amdaie.tile"(%15, %0) : (index, index) -> index
%276 = "amdaie.logicalobjectfifo.from_memref"(%285, %275) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%277 = "amdaie.tile"(%15, %0) : (index, index) -> index
%278 = "amdaie.logicalobjectfifo.from_memref"(%286, %277) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%279 = "amdaie.tile"(%15, %0) : (index, index) -> index
%280 = "amdaie.logicalobjectfifo.from_memref"(%287, %279) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%281 = "amdaie.core"(%274) ({
%282 = "amdaie.logicalobjectfifo.access"(%280) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%283 = "amdaie.logicalobjectfifo.access"(%278) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%284 = "amdaie.logicalobjectfifo.access"(%276) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%265) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%247) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg27: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg28: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg29: index):
%285 = "memref.subview"(%19, %arg29, %arg27) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%286 = "memref.subview"(%18, %arg28, %arg29) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%287 = "memref.subview"(%29, %arg28, %arg27) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%284, %283, %282) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg30: i32, %arg31: i32, %arg32: i32):
%288 = "arith.muli"(%arg30, %arg31) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%289 = "arith.addi"(%arg32, %288) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%289) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
%147 = "amdaie.dma_cpy_nd"(%28, %16, %16, %16, %16, %12, %15, %11, %11, %10, %10, %11, %15, %38, %16, %16, %48, %4, %12, %15, %11, %11, %9, %11, %8, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<128x256xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%148 = "amdaie.dma_cpy_nd"(%24, %16, %16, %16, %16, %15, %12, %11, %11, %7, %10, %11, %15, %42, %16, %16, %4, %47, %15, %12, %11, %11, %6, %11, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<256x128xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%149 = "amdaie.tile"(%15, %0) : (index, index) -> index
%150 = "amdaie.tile"(%16, %0) : (index, index) -> index
%151 = "amdaie.tile"(%15, %12) : (index, index) -> index
%152 = "amdaie.tile"(%16, %12) : (index, index) -> index
%153 = "amdaie.logicalobjectfifo.from_memref"(%29, %152, %150, %151, %149) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%154 = "amdaie.tile"(%15, %0) : (index, index) -> index
%155 = "amdaie.tile"(%16, %0) : (index, index) -> index
%156 = "amdaie.tile"(%15, %12) : (index, index) -> index
%157 = "amdaie.tile"(%16, %12) : (index, index) -> index
%158 = "amdaie.logicalobjectfifo.from_memref"(%29, %157, %155, %156, %154) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%159 = "amdaie.tile"(%15, %0) : (index, index) -> index
%160 = "amdaie.tile"(%16, %0) : (index, index) -> index
%161 = "amdaie.tile"(%15, %12) : (index, index) -> index
%162 = "amdaie.tile"(%16, %12) : (index, index) -> index
%163 = "amdaie.logicalobjectfifo.from_memref"(%29, %162, %160, %161, %159) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%164 = "amdaie.tile"(%15, %0) : (index, index) -> index
%165 = "amdaie.tile"(%16, %0) : (index, index) -> index
%166 = "amdaie.tile"(%15, %12) : (index, index) -> index
%167 = "amdaie.tile"(%16, %12) : (index, index) -> index
%168 = "amdaie.logicalobjectfifo.from_memref"(%29, %167, %165, %166, %164) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%169 = "amdaie.dma_cpy_nd"(%70, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%170 = "amdaie.dma_cpy_nd"(%55, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %15, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%171 = "amdaie.dma_cpy_nd"(%90, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%172 = "amdaie.dma_cpy_nd"(%34, %16, %16, %16, %16, %15, %15, %11, %11, %7, %10, %11, %15, %168, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %3, %13, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%173 = "amdaie.tile"(%16, %12) : (index, index) -> index
%174 = "amdaie.tile"(%16, %12) : (index, index) -> index
%175 = "amdaie.logicalobjectfifo.from_memref"(%237, %174) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%176 = "amdaie.tile"(%16, %12) : (index, index) -> index
%177 = "amdaie.logicalobjectfifo.from_memref"(%238, %176) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%178 = "amdaie.tile"(%16, %12) : (index, index) -> index
%179 = "amdaie.logicalobjectfifo.from_memref"(%239, %178) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%180 = "amdaie.core"(%173) ({
%234 = "amdaie.logicalobjectfifo.access"(%179) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%235 = "amdaie.logicalobjectfifo.access"(%177) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%236 = "amdaie.logicalobjectfifo.access"(%175) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%171) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%169) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg20: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg21: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg22: index):
%237 = "memref.subview"(%19, %arg22, %arg20) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%238 = "memref.subview"(%18, %arg21, %arg22) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%239 = "memref.subview"(%29, %arg21, %arg20) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%236, %235, %234) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg23: i32, %arg24: i32, %arg25: i32):
%240 = "arith.muli"(%arg23, %arg24) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%241 = "arith.addi"(%arg25, %240) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%241) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.logicalobjectfifo.produce"(%172) : (index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%181 = "amdaie.dma_cpy_nd"(%34, %16, %15, %16, %16, %15, %15, %11, %11, %7, %10, %11, %15, %158, %16, %15, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %3, %13, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%182 = "amdaie.tile"(%15, %12) : (index, index) -> index
%183 = "amdaie.tile"(%15, %12) : (index, index) -> index
%184 = "amdaie.logicalobjectfifo.from_memref"(%229, %183) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%185 = "amdaie.tile"(%15, %12) : (index, index) -> index
%186 = "amdaie.logicalobjectfifo.from_memref"(%230, %185) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%187 = "amdaie.tile"(%15, %12) : (index, index) -> index
%188 = "amdaie.logicalobjectfifo.from_memref"(%231, %187) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%189 = "amdaie.core"(%182) ({
%226 = "amdaie.logicalobjectfifo.access"(%188) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%227 = "amdaie.logicalobjectfifo.access"(%186) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%228 = "amdaie.logicalobjectfifo.access"(%184) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%171) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%170) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg14: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg15: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg16: index):
%229 = "memref.subview"(%19, %arg16, %arg14) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%230 = "memref.subview"(%18, %arg15, %arg16) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%231 = "memref.subview"(%29, %arg15, %arg14) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%228, %227, %226) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg17: i32, %arg18: i32, %arg19: i32):
%232 = "arith.muli"(%arg17, %arg18) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%233 = "arith.addi"(%arg19, %232) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%233) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.logicalobjectfifo.produce"(%181) : (index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%190 = "amdaie.dma_cpy_nd"(%75, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %15, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%191 = "amdaie.dma_cpy_nd"(%34, %15, %16, %16, %16, %15, %15, %11, %11, %7, %10, %11, %15, %163, %15, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %3, %13, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%192 = "amdaie.tile"(%16, %0) : (index, index) -> index
%193 = "amdaie.tile"(%16, %0) : (index, index) -> index
%194 = "amdaie.logicalobjectfifo.from_memref"(%221, %193) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%195 = "amdaie.tile"(%16, %0) : (index, index) -> index
%196 = "amdaie.logicalobjectfifo.from_memref"(%222, %195) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%197 = "amdaie.tile"(%16, %0) : (index, index) -> index
%198 = "amdaie.logicalobjectfifo.from_memref"(%223, %197) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%199 = "amdaie.core"(%192) ({
%218 = "amdaie.logicalobjectfifo.access"(%198) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%219 = "amdaie.logicalobjectfifo.access"(%196) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%220 = "amdaie.logicalobjectfifo.access"(%194) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%190) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%169) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg8: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg9: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg10: index):
%221 = "memref.subview"(%19, %arg10, %arg8) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%222 = "memref.subview"(%18, %arg9, %arg10) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%223 = "memref.subview"(%29, %arg9, %arg8) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%220, %219, %218) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg11: i32, %arg12: i32, %arg13: i32):
%224 = "arith.muli"(%arg11, %arg12) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%225 = "arith.addi"(%arg13, %224) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%225) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.logicalobjectfifo.produce"(%191) : (index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%200 = "amdaie.dma_cpy_nd"(%34, %15, %15, %16, %16, %15, %15, %11, %11, %7, %10, %11, %15, %153, %15, %15, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %3, %13, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%201 = "amdaie.tile"(%15, %0) : (index, index) -> index
%202 = "amdaie.tile"(%15, %0) : (index, index) -> index
%203 = "amdaie.logicalobjectfifo.from_memref"(%213, %202) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%204 = "amdaie.tile"(%15, %0) : (index, index) -> index
%205 = "amdaie.logicalobjectfifo.from_memref"(%214, %204) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%206 = "amdaie.tile"(%15, %0) : (index, index) -> index
%207 = "amdaie.logicalobjectfifo.from_memref"(%215, %206) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%208 = "amdaie.core"(%201) ({
%210 = "amdaie.logicalobjectfifo.access"(%207) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%211 = "amdaie.logicalobjectfifo.access"(%205) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%212 = "amdaie.logicalobjectfifo.access"(%203) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%190) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%170) : (index) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg2: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg3: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg4: index):
%213 = "memref.subview"(%19, %arg4, %arg2) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%214 = "memref.subview"(%18, %arg3, %arg4) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%215 = "memref.subview"(%29, %arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%212, %211, %210) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg5: i32, %arg6: i32, %arg7: i32):
%216 = "arith.muli"(%arg5, %arg6) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%217 = "arith.addi"(%arg7, %216) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%217) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.logicalobjectfifo.produce"(%200) : (index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%209 = "amdaie.dma_cpy_nd"(%46, %48, %47, %2, %2, %5, %15, %34, %16, %16, %16, %16, %12, %11, %12, %11, %7, %11, %10, %15) <{operandSegmentSizes = array<i32: 1, 2, 2, 2, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
"scf.forall.in_parallel"() ({
^bb0:
}) : () -> ()
}) : () -> ()
"memref.dealloc"(%31) : (memref<2x2x32x32xi32, 1 : i32>) -> ()
"memref.dealloc"(%25) : (memref<2x1x32x32xi32, 1 : i32>) -> ()
"memref.dealloc"(%21) : (memref<1x2x32x32xi32, 1 : i32>) -> ()
"memref.dealloc"(%19) : (memref<1x1x4x8x4x8xi32, 2 : i32>) -> ()
"memref.dealloc"(%18) : (memref<1x1x8x4x8x4xi32, 2 : i32>) -> ()
"memref.dealloc"(%29) : (memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"memref.dealloc"(%20) : (memref<1x2x32x32xi32, 1 : i32>) -> ()
"memref.dealloc"(%30) : (memref<2x2x32x32xi32, 1 : i32>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<Custom>} : () -> ()
}) : () -> ()
<stdin>:3:5: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
^
<stdin>:3:5: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg83: !hal.device):
%350:3 = "flow.dispatch.workgroup_count_from_slice"() : () -> (index, index, index)
"hal.return"(%350#0, %350#1, %350#2) : (index, index, index) -> ()
}) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "matmul_i32_dispatch_0_matmul_128x128x256_i32"}> ({
%0 = "arith.constant"() <{value = 3 : index}> : () -> index
%1 = "arith.constant"() <{value = 6 : index}> : () -> index
%2 = "arith.constant"() <{value = 64 : index}> : () -> index
%3 = "arith.constant"() <{value = 16 : index}> : () -> index
%4 = "arith.constant"() <{value = 224 : index}> : () -> index
%5 = "arith.constant"() <{value = 128 : index}> : () -> index
%6 = "arith.constant"() <{value = 4096 : index}> : () -> index
%7 = "arith.constant"() <{value = 2048 : index}> : () -> index
%8 = "arith.constant"() <{value = 256 : index}> : () -> index
%9 = "arith.constant"() <{value = 8192 : index}> : () -> index
%10 = "arith.constant"() <{value = 1024 : index}> : () -> index
%11 = "arith.constant"() <{value = 32 : index}> : () -> index
%12 = "arith.constant"() <{value = 2 : index}> : () -> index
%13 = "arith.constant"() <{value = 4 : index}> : () -> index
%14 = "arith.constant"() <{value = 8 : index}> : () -> index
%15 = "arith.constant"() <{value = 1 : index}> : () -> index
%16 = "arith.constant"() <{value = 0 : index}> : () -> index
%17 = "arith.constant"() <{value = 0 : i32}> : () -> i32
%18 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4x8x4xi32, 2 : i32>
%19 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x4x8x4x8xi32, 2 : i32>
%20 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x2x32x32xi32, 1 : i32>
%21 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x2x32x32xi32, 1 : i32>
%22 = "amdaie.tile"(%16, %15) : (index, index) -> index
%23 = "amdaie.tile"(%15, %15) : (index, index) -> index
%24 = "amdaie.logicalobjectfifo.from_memref"(%20, %22) : (memref<1x2x32x32xi32, 1 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
%25 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<2x1x32x32xi32, 1 : i32>
%26 = "amdaie.tile"(%16, %15) : (index, index) -> index
%27 = "amdaie.tile"(%15, %15) : (index, index) -> index
%28 = "amdaie.logicalobjectfifo.from_memref"(%20, %26) : (memref<1x2x32x32xi32, 1 : i32>, index) -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
%29 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x8x4x4xi32, 2 : i32>
%30 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<2x2x32x32xi32, 1 : i32>
%31 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<2x2x32x32xi32, 1 : i32>
%32 = "amdaie.tile"(%16, %15) : (index, index) -> index
%33 = "amdaie.tile"(%15, %15) : (index, index) -> index
%34 = "amdaie.logicalobjectfifo.from_memref"(%30, %32) : (memref<2x2x32x32xi32, 1 : i32>, index) -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
%35 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<128x256xi32>
%36 = "amdaie.tile"(%16, %16) : (index, index) -> index
%37 = "amdaie.tile"(%15, %16) : (index, index) -> index
%38 = "amdaie.logicalobjectfifo.from_memref"(%35, %36) : (memref<128x256xi32>, index) -> !amdaie.logicalobjectfifo<memref<128x256xi32>>
"memref.assume_alignment"(%35) <{alignment = 64 : i32}> : (memref<128x256xi32>) -> ()
%39 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<256x128xi32>
%40 = "amdaie.tile"(%16, %16) : (index, index) -> index
%41 = "amdaie.tile"(%15, %16) : (index, index) -> index
%42 = "amdaie.logicalobjectfifo.from_memref"(%39, %40) : (memref<256x128xi32>, index) -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
"memref.assume_alignment"(%39) <{alignment = 64 : i32}> : (memref<256x128xi32>) -> ()
%43 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<128x128xi32>
%44 = "amdaie.tile"(%16, %16) : (index, index) -> index
%45 = "amdaie.tile"(%15, %16) : (index, index) -> index
%46 = "amdaie.logicalobjectfifo.from_memref"(%43, %44) : (memref<128x128xi32>, index) -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
"memref.assume_alignment"(%43) <{alignment = 64 : i32}> : (memref<128x128xi32>) -> ()
"scf.forall"() <{mapping = [#gpu.block<y>, #gpu.block<x>], operandSegmentSizes = array<i32: 0, 0, 0, 0>, staticLowerBound = array<i64: 0, 0>, staticStep = array<i64: 1, 1>, staticUpperBound = array<i64: 2, 2>}> ({
^bb0(%arg0: index, %arg1: index):
%47 = "affine.apply"(%arg1) <{map = affine_map<(d0) -> (d0 * 64)>}> : (index) -> index
%48 = "affine.apply"(%arg0) <{map = affine_map<(d0) -> (d0 * 64)>}> : (index) -> index
%49 = "amdaie.dma_cpy_nd"(%28, %16, %16, %16, %16, %12, %15, %11, %11, %10, %10, %11, %15, %38, %16, %16, %48, %16, %12, %15, %11, %11, %9, %11, %8, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<128x256xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%50 = "amdaie.dma_cpy_nd"(%24, %16, %16, %16, %16, %15, %12, %11, %11, %7, %10, %11, %15, %42, %16, %16, %16, %47, %15, %12, %11, %11, %6, %11, %5, %15) <{operandSegmentSizes = array<i32: 1, 4, 4, 4, 1, 4, 4, 4>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<256x128xi32>>, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%51 = "amdaie.tile"(%15, %0) : (index, index) -> index
%52 = "amdaie.tile"(%16, %0) : (index, index) -> index
%53 = "amdaie.tile"(%15, %12) : (index, index) -> index
%54 = "amdaie.tile"(%16, %12) : (index, index) -> index
%55 = "amdaie.logicalobjectfifo.from_memref"(%18, %54, %52, %53, %51) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%56 = "amdaie.tile"(%15, %0) : (index, index) -> index
%57 = "amdaie.tile"(%16, %0) : (index, index) -> index
%58 = "amdaie.tile"(%15, %12) : (index, index) -> index
%59 = "amdaie.tile"(%16, %12) : (index, index) -> index
%60 = "amdaie.logicalobjectfifo.from_memref"(%18, %59, %57, %58, %56) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%61 = "amdaie.tile"(%15, %0) : (index, index) -> index
%62 = "amdaie.tile"(%16, %0) : (index, index) -> index
%63 = "amdaie.tile"(%15, %12) : (index, index) -> index
%64 = "amdaie.tile"(%16, %12) : (index, index) -> index
%65 = "amdaie.logicalobjectfifo.from_memref"(%18, %64, %62, %63, %61) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%66 = "amdaie.tile"(%15, %0) : (index, index) -> index
%67 = "amdaie.tile"(%16, %0) : (index, index) -> index
%68 = "amdaie.tile"(%15, %12) : (index, index) -> index
%69 = "amdaie.tile"(%16, %12) : (index, index) -> index
%70 = "amdaie.logicalobjectfifo.from_memref"(%18, %69, %67, %68, %66) : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
%71 = "amdaie.tile"(%15, %0) : (index, index) -> index
%72 = "amdaie.tile"(%16, %0) : (index, index) -> index
%73 = "amdaie.tile"(%15, %12) : (index, index) -> index
%74 = "amdaie.tile"(%16, %12) : (index, index) -> index
%75 = "amdaie.logicalobjectfifo.from_memref"(%19, %74, %72, %73, %71) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%76 = "amdaie.tile"(%15, %0) : (index, index) -> index
%77 = "amdaie.tile"(%16, %0) : (index, index) -> index
%78 = "amdaie.tile"(%15, %12) : (index, index) -> index
%79 = "amdaie.tile"(%16, %12) : (index, index) -> index
%80 = "amdaie.logicalobjectfifo.from_memref"(%19, %79, %77, %78, %76) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%81 = "amdaie.tile"(%15, %0) : (index, index) -> index
%82 = "amdaie.tile"(%16, %0) : (index, index) -> index
%83 = "amdaie.tile"(%15, %12) : (index, index) -> index
%84 = "amdaie.tile"(%16, %12) : (index, index) -> index
%85 = "amdaie.logicalobjectfifo.from_memref"(%19, %84, %82, %83, %81) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%86 = "amdaie.tile"(%15, %0) : (index, index) -> index
%87 = "amdaie.tile"(%16, %0) : (index, index) -> index
%88 = "amdaie.tile"(%15, %12) : (index, index) -> index
%89 = "amdaie.tile"(%16, %12) : (index, index) -> index
%90 = "amdaie.logicalobjectfifo.from_memref"(%19, %89, %87, %88, %86) : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
%91 = "amdaie.dma_cpy_nd"(%70, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%92 = "amdaie.dma_cpy_nd"(%65, %16, %16, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %10, %10, %5, %11, %13, %15, %24, %16, %15, %16, %16, %16, %16, %15, %15, %14, %13, %14, %13, %7, %10, %13, %8, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%93 = "amdaie.dma_cpy_nd"(%90, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, source_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_sizes = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>, target_static_strides = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808>}> : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> index
%94 = "amdaie.tile"(%16, %12) : (index, index) -> index
%95 = "amdaie.tile"(%16, %12) : (index, index) -> index
%96 = "amdaie.tile"(%15, %0) : (index, index) -> index
%97 = "amdaie.tile"(%16, %0) : (index, index) -> index
%98 = "amdaie.tile"(%15, %12) : (index, index) -> index
%99 = "amdaie.logicalobjectfifo.from_memref"(%29, %95, %97, %98, %96) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%100 = "amdaie.tile"(%16, %12) : (index, index) -> index
%101 = "amdaie.logicalobjectfifo.from_memref"(%345, %100) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%102 = "amdaie.tile"(%16, %12) : (index, index) -> index
%103 = "amdaie.logicalobjectfifo.from_memref"(%346, %102) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%104 = "amdaie.tile"(%16, %12) : (index, index) -> index
%105 = "amdaie.logicalobjectfifo.from_memref"(%347, %104) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%106 = "amdaie.core"(%94) ({
%341 = "amdaie.logicalobjectfifo.access"(%105) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%342 = "amdaie.logicalobjectfifo.access"(%103) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%343 = "amdaie.logicalobjectfifo.access"(%101) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%344 = "amdaie.logicalobjectfifo.access"(%99) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) -> memref<1x1x8x8x4x4xi32, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%93) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%91) : (index) -> ()
"linalg.fill"(%17, %344) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg81: i32, %arg82: i32):
"linalg.yield"(%arg81) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg75: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg76: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg77: index):
%345 = "memref.subview"(%19, %arg77, %arg75) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%346 = "memref.subview"(%18, %arg76, %arg77) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%347 = "memref.subview"(%29, %arg76, %arg75) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%343, %342, %341) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg78: i32, %arg79: i32, %arg80: i32):
%348 = "arith.muli"(%arg78, %arg79) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%349 = "arith.addi"(%arg80, %348) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%349) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%107 = "amdaie.tile"(%15, %12) : (index, index) -> index
%108 = "amdaie.tile"(%15, %12) : (index, index) -> index
%109 = "amdaie.tile"(%15, %0) : (index, index) -> index
%110 = "amdaie.tile"(%16, %0) : (index, index) -> index
%111 = "amdaie.tile"(%16, %12) : (index, index) -> index
%112 = "amdaie.logicalobjectfifo.from_memref"(%29, %111, %110, %108, %109) : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index) -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
%113 = "amdaie.tile"(%15, %12) : (index, index) -> index
%114 = "amdaie.logicalobjectfifo.from_memref"(%336, %113) : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>
%115 = "amdaie.tile"(%15, %12) : (index, index) -> index
%116 = "amdaie.logicalobjectfifo.from_memref"(%337, %115) : (memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>
%117 = "amdaie.tile"(%15, %12) : (index, index) -> index
%118 = "amdaie.logicalobjectfifo.from_memref"(%338, %117) : (memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, index) -> !amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>
%119 = "amdaie.core"(%107) ({
%332 = "amdaie.logicalobjectfifo.access"(%118) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
%333 = "amdaie.logicalobjectfifo.access"(%116) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%334 = "amdaie.logicalobjectfifo.access"(%114) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>>) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%335 = "amdaie.logicalobjectfifo.access"(%112) <{access_type = 0 : i32}> : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>) -> memref<1x1x8x8x4x4xi32, 2 : i32>
"amdaie.logicalobjectfifo.consume"(%93) : (index) -> ()
"amdaie.logicalobjectfifo.consume"(%92) : (index) -> ()
"linalg.fill"(%17, %335) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg73: i32, %arg74: i32):
"linalg.yield"(%arg73) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"scf.for"(%16, %14, %15) ({
^bb0(%arg67: index):
"scf.for"(%16, %14, %15) ({
^bb0(%arg68: index):
"scf.for"(%16, %13, %15) ({
^bb0(%arg69: index):
%336 = "memref.subview"(%19, %arg69, %arg67) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x4x8x4x8xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>
%337 = "memref.subview"(%18, %arg68, %arg69) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x4x8x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>
%338 = "memref.subview"(%29, %arg68, %arg67) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 1, 1, 1, 1, 4, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<1x1x8x8x4x4xi32, 2 : i32>, index, index) -> memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
"linalg.generic"(%334, %333, %332) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg70: i32, %arg71: i32, %arg72: i32):
%339 = "arith.muli"(%arg70, %arg71) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%340 = "arith.addi"(%arg72, %339) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%340) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x1x1x4x8xi32, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x8x4xi32, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, memref<1x1x1x1x4x4xi32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"amdaie.end"() : () -> ()
}) : (index) -> index
%120 = "amdaie.dma_cpy_nd"(%85, %16, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %8, %11, %14, %15, %28, %15, %16, %16, %16, %16, %16, %15, %15, %13, %14, %13, %14, %10, %10, %14, %5, %11, %15) <{operandSegmentSizes = array<i32: 1, 6, 6, 6, 1, 6, 6, 6>, source_static_offsets = array<i64: -9223372036854775808, -9223372036854775808, -9223372036854775808, -9223372036854775808, -92233720368547758
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment