// Gist by @Abhishek-Varma, last active March 7, 2024 14:02
// -----// IR Dump Before TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- //
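// NOTE: the "NOTE:" lines in this gist are editorial annotations, not part of the
// compiler output. This log traces the IREE AMD-AIE pipeline for one dispatch:
// a 2048x2048x512 i32 matmul lowered for the "amd-aie" / "amdaie-xclbin-fb"
// target (target_arch = "chip-tbd", no ukernels). The baseline below is a plain
// linalg.fill (zero-init) plus linalg.matmul over flow.dispatch.tensor bindings;
// each "IR Dump Before <Pass>" banner shows the IR exactly as that pass receives it.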
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before TypePropagation (iree-codegen-type-propagation) //----- //
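// NOTE: this dump and the next few (BubbleUpOrdinalOps, BufferizeCopyOnlyDispatches,
// DecomposeSoftmax) show an identical function body: all values here are already
// i32 so there is nothing for type propagation to legalize, there are no ordinal
// ops, the dispatch is not copy-only, and there is no softmax to decompose.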
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
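// NOTE: still unchanged, presumably because no user-specified compilation info is
// attached to this dispatch for MaterializeUserConfigs to materialize.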
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- //
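// NOTE: AMDAIELoweringStrategy is where the tiling/packing strategy gets chosen;
// compare this dump with the next one to see the lowering_config and
// packing_config it attaches to the linalg.matmul.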
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- //
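// NOTE: the matmul below now carries tile_sizes [[64, 64], [0, 0, 0, 32, 32],
// [0, 0, 0, 0, 0, 4]] (presumably one entry per tiling level applied later in
// the pipeline) and a two-stage #amdaie.packing_config: an outer 64x64x512 pack
// (with innerPerm [1, 0] transposing operand index 1, i.e. B) and an inner
// 4x4x8 pack. The hal.executable.export also picks up translation_info<None>.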
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
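// NOTE: this first AMDAIETileAndFuse application tiles the matmul by the
// level-0 sizes (64x64) into an scf.forall over the 2048x2048 output, mapped to
// #gpu.block<y>/<x>, and fuses the zero-fill onto each tile (see the next dump).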
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
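// NOTE: tiling left the full-size fill (%6) and matmul (%7) behind as dead code
// next to the new scf.forall (%8); AMDAIECleanup erases them (compare the
// Before Canonicalizer dump that follows).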
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%9 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
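// NOTE: cleanup has removed the untiled ops, leaving only the 64x64-tiled
// scf.forall; the canonicalize and CSE rounds below keep this form unchanged.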
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
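// NOTE: AMDAIEPackAndTranspose applies the first packing level (see the next
// dump): the operands and init are packed via tensor.pack into 1x1x64x512,
// 1x1x512x64, and 1x1x64x64 tensors, the matmul is rewritten as a 6-D
// linalg.generic with iterator types (parallel, parallel, reduction) over both
// the outer and inner packed dims, and the result is restored with tensor.unpack.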
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
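// NOTE: AMDAIEBufferizeToAllocation gives each pack destination a concrete
// buffer: a memref.alloc in memory space 1 (presumably an intermediate AIE
// memory level such as memtile/L2), bridged back into tensors with
// bufferization.to_tensor restrict writable, plus matching memref.dealloc ops
// at the end of each tile (see the next dump).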
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%9 = tensor.empty() : tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x512x64xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %10 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%11 = tensor.empty() : tensor<1x1x64x64xi32>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_3 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_4: i32, %out: i32):
%13 = arith.muli %in, %in_4 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
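// NOTE: the second AMDAIETileAndFuse application tiles the packed
// linalg.generic by the level-1 sizes (32x32) into a nested scf.forall over the
// 64x64 tile, extracting 1x1x32x512, 1x1x512x32, and 1x1x32x32 slices (next dump).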
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%16 = arith.muli %in, %in_6 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %15 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
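// NOTE: as in the first tiling round, the untiled packed generic (%15) is left
// dead next to the nested forall (%16); cleanup erases it before the next
// canonicalize pass. The nested forall still reuses the #gpu.block mapping
// attribute at this stage.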
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%17 = arith.muli %in, %in_6 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x64x64xi32>
%16 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %pack_5) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_6 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_7 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_8 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%18 = arith.muli %in, %in_9 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %16 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
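// NOTE: by this point the zero-fill is applied directly to the packed
// 1x1x64x64 tensor (%11) instead of packing a pre-filled 64x64 tile; the
// canonicalizer below then drops the now-redundant 64x64 fill and retargets
// tensor.unpack straight at the output slice (compare the Before CSE dump).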
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %9 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%12 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %11) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%14 = arith.muli %in, %in_8 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- //
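// NOTE: AMDAIEFuseFillIntoForall receives the IR below; going by the pass name,
// it should sink the linalg.fill (%10) into the nested scf.forall so each
// 32x32 slice is zero-initialized where it is computed (the resulting dump is
// not shown here).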
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
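
The next dump shows the effect of iree-amdaie-fuse-fill-into-forall: the full-tile linalg.fill (%10 above) is replicated inside the inner scf.forall as a fill of each 1x1x32x32 accumulator slice, so every piece is zero-initialized where it is computed. The original fill and a duplicated extract_slice are left behind as dead code for the AMDAIECleanup pass that follows.
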
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%12 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%14 = arith.muli %in, %in_9 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
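
AMDAIECleanup erases that dead full-tile fill (%10) and the duplicate %extracted_slice_8, so from the next dump onward the inner forall carries only the fused per-slice fill.
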
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
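
This CSE input is identical to the canonicalizer input above it: the preceding cleanup already left the function in canonical form, and the next dump (the input to the second iree-amdaie-pack-and-transpose application) is unchanged as well.
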
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
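
The dump above is what the second iree-amdaie-pack-and-transpose application sees. The second packing_config entry (packedSizes = [0, 0, 0, 4, 4, 8] with innerPerm = [[0, 1], [1, 0], [0, 1]] and outerPerm = [0, 1, 3, 2] for all three operands) drives the micro-tiling that appears in the next dump as %pack_8 (A), %pack_9 (B), and %pack_10 (the accumulator).
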
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%pack_10 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_10 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%19 = arith.muli %in, %in_12 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_11 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
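
This dump shows that second packing level in place: %pack_8 retiles the 1x1x32x512 A slice into 4x8 micro-tiles, %pack_9 the 1x1x512x32 B slice into 8x4 micro-tiles (operand 1's innerPerm = [1, 0] swaps its tile dims), and %pack_10 the 1x1x32x32 accumulator into 4x4 micro-tiles, each with outer_dims_perm = [0, 1, 3, 2]. A hypothetical NumPy model of the A-side pack, to make the layout concrete (function name and test data are ours):

import numpy as np

# tensor.pack with outer_dims_perm = [0, 1, 3, 2], inner_dims_pos = [2, 3],
# inner_tiles = [4, 8]: tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>.
def pack_a_tile(x):
    b0, b1, m, k = x.shape
    # Split the tiled dims: (b0, b1, m/4, 4, k/8, 8).
    t = x.reshape(b0, b1, m // 4, 4, k // 8, 8)
    # Move the tile dims innermost: outer (b0, b1, m/4, k/8), inner (4, 8).
    t = t.transpose(0, 1, 2, 4, 3, 5)
    # Apply outer_dims_perm [0, 1, 3, 2]: outer becomes (b0, b1, k/8, m/4).
    return t.transpose(0, 1, 3, 2, 4, 5)

a = np.arange(32 * 512, dtype=np.int32).reshape(1, 1, 32, 512)
assert pack_a_tile(a).shape == (1, 1, 64, 8, 4, 8)
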
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%20 = arith.muli %in, %in_13 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_12 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
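
iree-amdaie-bufferize-to-allocation has now given the accumulator micro-tile its own buffer: the tensor.empty destination of the accumulator pack is replaced by a bufferization.to_tensor of %alloc_10, a memref<1x1x8x8x4x4xi32, 2 : i32>. Note the two memory spaces in play: the 64x512, 512x64, and 64x64 tiles live in space 1 and this per-core accumulator in space 2, which in this pipeline appear to correspond to AIE memory-tile (L2) and core-local (L1) memory respectively.
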
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%21 = arith.muli %in, %in_14 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
%c0_12 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%20 = scf.for %arg6 = %c0_12 to %c64 step %c4 iter_args(%arg7 = %pack_11) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%extracted_slice_16 = tensor.extract_slice %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_14, %extracted_slice_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%22 = arith.muli %in, %in_17 : i32
%23 = arith.addi %out, %22 : i32
linalg.yield %23 : i32
} -> tensor<1x1x8x8x4x4xi32>
%inserted_slice = tensor.insert_slice %21 into %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
%unpack_13 = tensor.unpack %20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
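
AMDAIETileAndFuse has materialized the last tile_sizes entry ([0, 0, 0, 0, 0, 4]) as an scf.for reduction loop from 0 to 64 step 4 over the outer K dimension of the packed operands; the full-size extract/insert slices and the in-loop constants it left behind are fodder for the cleanup and canonicalization that follow. A quick check of the loop bounds (plain Python, names ours):

# The packed A tile is 1x1x64x8x4x8: dim 2 (size 64) is K / inner_k.
K, inner_k, step = 512, 8, 4
k_outer = K // inner_k          # 64: upper bound of the scf.for
iters = k_outer // step         # 16 iterations
per_iter_k = step * inner_k     # each 1x1x4x8x4x8 slice covers 32 K elements
assert iters * per_iter_k == K
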
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%13 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%16 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%18 = arith.muli %in, %in_14 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
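
Comparing this with the next dump shows what the canonicalizer folds away here: the per-slice linalg.fill (%11) is only used as the destination of %unpack_11, which overwrites all of it, so the unpack is redirected to %extracted_slice_7 and the fill disappears. Zero-initialization of the accumulator survives solely as the linalg.fill on the space-2 buffer that seeds the scf.for's iter_args.
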
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFusePackIntoForLoop (iree-amdaie-fuse-pack-into-for) //----- //
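// NOTE: In this dump the innermost packs (%pack_8, %pack_9) are still materialized in
// full ahead of the reduction loop, and the scf.for only extracts per-iteration slices
// of them. The pass named above presumably sinks those packs into the loop so that each
// iteration packs only the 32x32 slice it actually consumes.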
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
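// NOTE: The fuse-pack-into-for pass has rewritten the reduction loop: each iteration now
// packs its own 1x1x32x32 slice (%pack_14, %pack_18) directly into a slice of the packed
// destination. The original whole-tile packs (%pack_8, %pack_9), their extract_slices,
// and a duplicated affine.apply are now dead, left for AMDAIECleanup to drop.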
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_13 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_13 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_16 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_17 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_18 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_19 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_14, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%19 = arith.muli %in, %in_20 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
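// NOTE: AMDAIECleanup has removed the dead whole-tile packs and their unused slices; what
// remains to tidy up are the empty destination tensors %11 and %12 and the pair of
// duplicate affine.apply ops inside the scf.for.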
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
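// NOTE: Canonicalization appears to have left the structure unchanged; the two identical
// affine.apply affine_map<(d0) -> (d0 * 8)> ops (%16 and %17) are the obvious candidates
// for the CSE run below.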
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
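// NOTE: CSE has merged the duplicate affine.apply, so %16 now feeds both the LHS and RHS
// slice offsets in the reduction loop. The pass named above is expected to give the
// per-iteration pack destinations explicit local allocations instead of slices of
// tensor.empty.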
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- //
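// NOTE: AMDAIEBufferizeToAllocation has given each per-iteration pack an explicit
// destination buffer in local memory (%alloc_12, %alloc_16 : memref<..., 2 : i32>), with
// matching deallocs after the compute. The tensor.empty producers %11/%12 and the slices
// taken from them are now dead.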
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%alloc_12 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%17 = bufferization.to_tensor %alloc_12 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_15 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%alloc_16 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_16 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%20 = arith.muli %in, %in_18 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_16 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
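// NOTE: No ukernel calls were introduced, and the dead tensor.empty producers and their
// slices are gone. The only tensor.empty left is the 2048x2048 result initializer %5,
// which the next pass should tie to the output binding so the result is computed in
// place.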
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
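// NOTE: EliminateEmptyTensors has replaced %5 = tensor.empty() with a
// flow.dispatch.tensor.load of the output binding %2, so the outer scf.forall now updates
// the dispatch output in place and no tensor.empty remains for the next pass to convert.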
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
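// NOTE: With no tensor.empty left, EmptyTensorToAllocTensor appears to have been a no-op;
// this is the final tensor-level IR handed to comprehensive bufferization.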
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
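// Note (annotation): the IR is now fully bufferized. Bufferization also
// materialized conservative copy-back linalg.generic ops: identity copies from the
// computed subviews into their destinations, plus a whole-buffer copy of %2 onto
// itself at the end of the function. The canonicalize/CSE passes that follow are
// expected to eliminate these redundant copies.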
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
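// Note (annotation): resolve-shaped-type-result-dims made no visible change to
// this function. The canonicalizer below folds the scf.for iter_args (the memref
// accumulator is yielded unchanged each iteration) and drops the trivial
// whole-buffer copy of %2 onto itself.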
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump Before CSE (cse) //----- //
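// Note (annotation): canonicalization removed the scf.for iter_args (the
// accumulator %alloc_8 is now used directly by the unpack) and erased the
// redundant whole-buffer copy of %2. The remaining subview-to-subview copies
// survive only because the duplicated memref.subview ops have not been merged
// yet; CSE below deduplicates them.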
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%4 = arith.muli %in, %in_14 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
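// Note (annotation): CSE merged the duplicate memref.subview ops, so the
// copy-back linalg.generic ops below now read and write the same view (e.g.
// ins(%subview_6) outs(%subview_6) and ins(%subview_1) outs(%subview_1)). The
// next canonicalization erases these self-copies.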
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
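// Note (annotation): the self-copies are gone, leaving the bare bufferized
// pipeline: pack into memory space 1, subtile, pack into memory space 2, the 9-D
// packed matmul, and unpack back out. CleanupBufferAllocView folds away unused or
// trivially aliasing allocation views, but nothing here qualifies; the next dump
// is unchanged.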
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before LowerUKernelOpsToCalls (iree-codegen-lower-ukernel-ops-to-calls) //----- //
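// Note (annotation): this variant was compiled with ukernels = "none" (see the
// target attributes), so no ukernel ops exist in the IR and this pass appears to
// be a no-op here; the next dump is identical.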
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
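// Note (annotation): fold-memref-alias-ops rewrites subview-of-subview chains so
// users index the underlying allocation directly. In the next dump the
// intermediate %subview_4 / %subview_5 views of %alloc and %alloc_2 no longer
// appear at the top of the inner scf.forall.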
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
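// NOTE (editorial annotation, not compiler output): the dump above shows the fully
// tiled and packed matmul, now on memrefs (post-bufferization). The 9-D
// linalg.generic is a multiply-accumulate (muli + addi body) over the packed
// 1x1x4x8x4x8 LHS and 1x1x8x4x8x4 RHS tiles into the 1x1x8x8x4x4 accumulator;
// its iterator_types repeat (parallel, parallel, reduction) once per tiling level.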
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
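// NOTE (editorial): this dump is the input to canonicalize. Comparing it with the
// next dump ("Before CSE") shows no visible change: the IR is already in canonical
// form at this stage, so the pass is effectively a no-op here.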
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before CSE (cse) //----- //
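// NOTE (editorial): CSE folds the duplicated index computation inside the K-loop.
// In this dump both
//   %3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
//   %4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
// compute the same value; the next dump keeps only %3 and rewrites %subview_8 to
// index %alloc_2 with it.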
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before AMDAIELowerWorkgroupCount (iree-amdaie-lower-workgroup-count) //----- //
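// NOTE (editorial): iree-amdaie-lower-workgroup-count presumably rewrites the
// export's workgroup-count region (the flow.dispatch.workgroup_count_from_slice
// above); its effect is not visible in this log, since from the next dump onward
// only the inner builtin.module is printed, without the hal.executable.variant
// wrapper.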
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
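// NOTE (editorial): this pass strips the #hal.descriptor_type<storage_buffer>
// memory-space annotation from memref types, e.g.
//   memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> -> memref<2048x512xi32>
// as the next dump ("Before FoldMemRefAliasOps") shows.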
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
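// NOTE (editorial): fold-memref-alias-ops folds subview/expand_shape aliases into
// their load/store-like users. No such users exist at this level yet (the subviews
// feed pack/unpack and linalg ops), so the next dump is unchanged.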
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEBridgeToAIR (iree-amdaie-bridge-to-air) //----- //
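// NOTE (editorial): iree-amdaie-bridge-to-air rewrites both scf.forall loops into
// scf.parallel with explicit constant bounds and scf.reduce terminators, dropping
// the #gpu.block mapping attributes:
//   scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {...} {mapping = [...]}
// becomes
//   scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {... scf.reduce}
// (compare with the next dump). The K-loop affine.apply also switches from a dim
// to a symbol operand: affine_map<()[s0] -> (s0 * 8)>()[%arg4].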
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEDecomposeLinalgExtPackUnPackToAIR (iree-amdaie-decompose-pack-unpack-to-air) //----- //
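// NOTE (editorial): this pass lowers each iree_linalg_ext.pack/unpack on memrefs
// into view ops plus a DMA. A whole-tile pack such as the one into %alloc becomes a
// single air.dma_memcpy_nd, while a retiling pack such as
//   iree_linalg_ext.pack %subview_6 ... inner_tiles = [4, 8] into %alloc_7
// becomes memref.expand_shape + memref.transpose describing the strided source
// view, followed by air.dma_memcpy_nd into the L1 buffer; unpacks are handled
// analogously with memref.transpose on the source (see the next dump).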
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToHerd (air-par-to-herd) //----- //
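// NOTE (editorial): air-par-to-herd maps the inner 2x2 scf.parallel (64x64
// iteration space, step 32) onto an air.herd:
//   air.herd @herd_0 tile (%arg2, %arg3) in (%arg4=%c2, %arg5=%c2) ...
// The herd induction variables are normalized to {0, 1}, so the original tile
// offsets are recovered inside the body with affine.apply affine_map<(d0) -> (d0 * 32)>.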
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_5 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_8 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_8 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_10 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %transpose_10[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_11 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_13 = memref.expand_shape %subview_11 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %transpose_14[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_15: i32, %out: i32):
%4 = arith.muli %in, %in_15 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_7 = memref.transpose %alloc_6 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_5[] [] [], %transpose_7[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
%subview_4 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_4 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToLaunch (air-par-to-launch) //----- //
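// NOTE (editorial): air-par-to-launch performs the analogous rewrite one level up:
// the outer scf.parallel over (2048, 2048) with step 64 becomes
//   air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32)
// wrapping an air.segment, with the 64-element block offsets recovered via
// affine.apply affine_map<(d0) -> (d0 * 64)>. The launch -> segment -> herd nesting
// in the next dump is the complete AIR control hierarchy.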
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_4 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2_5 = arith.constant 2 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c2_8 = arith.constant 2 : index
%c2_9 = arith.constant 2 : index
air.herd @herd_0 tile (%arg2, %arg3) in (%arg4=%c2_8, %arg5=%c2_9) args(%arg6=%alloc_3, %arg7=%alloc, %arg8=%alloc_2) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_11 = arith.constant 0 : i32
%c0_12 = arith.constant 0 : index
%c64_13 = arith.constant 64 : index
%c4_14 = arith.constant 4 : index
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%subview_15 = memref.subview %arg6[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_16 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_11 : i32) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg9 = %c0_12 to %c64_13 step %c4_14 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
%subview_18 = memref.subview %arg7[0, 0, %3, %5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_19 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_18 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_20 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_19[] [] [], %transpose_20[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_21 = memref.subview %arg8[0, 0, %5, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_22 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_23 = memref.expand_shape %subview_21 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_22[] [] [], %transpose_24[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_19, %alloc_22 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_25: i32, %out: i32):
%6 = arith.muli %in, %in_25 : i32
%7 = arith.addi %out, %6 : i32
linalg.yield %7 : i32
}
memref.dealloc %alloc_19 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_22 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_17 = memref.transpose %alloc_16 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_15[] [] [], %transpose_17[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before CopyToDma (air-copy-to-dma) //----- //
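// NOTE (editorial): this is the input to air-copy-to-dma, which (per the pass name)
// rewrites remaining memref copies as air.dma_memcpy_nd. In this IR the data
// movement between L3, L2, and L1 is already expressed as air.dma_memcpy_nd, so the
// dump mainly shows the finished launch -> segment -> herd structure feeding those
// DMAs.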
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
%c32_0 = arith.constant 32 : index
%c0_1 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_2 = arith.constant 32 : index
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%c32_5 = arith.constant 32 : index
%c32_6 = arith.constant 32 : index
air.launch (%arg0, %arg1) in (%arg2=%c32_5, %arg3=%c32_6) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg7)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg8)
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_7 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_8 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_9 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %subview_7[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_10 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_11 = arith.constant 0 : index
%c1_12 = arith.constant 1 : index
%c2_13 = arith.constant 2 : index
%c0_14 = arith.constant 0 : index
%c1_15 = arith.constant 1 : index
%c2_16 = arith.constant 2 : index
%c2_17 = arith.constant 2 : index
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2_16, %arg17=%c2_17) args(%arg18=%alloc_10, %arg19=%alloc, %arg20=%alloc_9) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_19 = arith.constant 0 : i32
%c0_20 = arith.constant 0 : index
%c64_21 = arith.constant 64 : index
%c4_22 = arith.constant 4 : index
%5 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg14)
%6 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg15)
%subview_23 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_24 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_19 : i32) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_20 to %c64_21 step %c4_22 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_26 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_27 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_26 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_28 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_27[] [] [], %transpose_28[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_29 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_30 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_31 = memref.expand_shape %subview_29 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_32 = memref.transpose %expand_shape_31 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_30[] [] [], %transpose_32[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_27, %alloc_30 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_33: i32, %out: i32):
%8 = arith.muli %in, %in_33 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_27 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_30 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_25 = memref.transpose %alloc_24 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_23[] [] [], %transpose_25[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_18 = memref.subview %alloc_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_18 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_8[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_10 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
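// The view folding above leaves a lot of dead IR behind in the dump below: long runs
// of duplicated index constants (%c0_58 ... %c0_92, repeated %c32768, %c4096, %c1024)
// plus memref.subview / memref.expand_shape / memref.transpose ops whose only users
// were the rewritten DMAs. Canonicalization deletes the dead views and folds the
// duplicate constants; the CSE dump that follows shows the cleaned-up form, with each
// constant defined once per region.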
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%c0_2 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c512_3 = arith.constant 512 : index
air.dma_memcpy_nd (%alloc[] [] [], %arg11[%3, %c0_2] [%c64, %c512_3] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_4 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%c0_5 = arith.constant 0 : index
%c2048 = arith.constant 2048 : index
%c1_6 = arith.constant 1 : index
%c512_7 = arith.constant 512 : index
%c64_8 = arith.constant 64 : index
air.dma_memcpy_nd (%alloc_4[] [] [], %arg12[%c0_5, %4] [%c512_7, %c64_8] [%c2048, %c1_6]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_9 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2, %arg17=%c2) args(%arg18=%alloc_9, %arg19=%alloc, %arg20=%alloc_4) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg14]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg15]
%subview_28 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_29 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_26 to %c64_27 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_55 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_56 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_55 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_57 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
%c0_58 = arith.constant 0 : index
%c0_59 = arith.constant 0 : index
%c32768 = arith.constant 32768 : index
%c32768_60 = arith.constant 32768 : index
%c8_61 = arith.constant 8 : index
%c2048_62 = arith.constant 2048 : index
%c512_63 = arith.constant 512 : index
%c1_64 = arith.constant 1 : index
%c1_65 = arith.constant 1 : index
%c1_66 = arith.constant 1 : index
%c4_67 = arith.constant 4 : index
%c8_68 = arith.constant 8 : index
%c4_69 = arith.constant 4 : index
%c8_70 = arith.constant 8 : index
%c0_71 = arith.constant 0 : index
%c0_72 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_56[] [] [], %arg19[%c0_72, %c0_71, %c0_58, %c0_59, %5, %7] [%c1_65, %c1_66, %c4_67, %c8_68, %c4_69, %c8_70] [%c32768, %c32768_60, %c8_61, %c2048_62, %c512_63, %c1_64]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%subview_73 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_74 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_75 = memref.expand_shape %subview_73 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_76 = memref.transpose %expand_shape_75 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
%c0_77 = arith.constant 0 : index
%c0_78 = arith.constant 0 : index
%c32768_79 = arith.constant 32768 : index
%c32768_80 = arith.constant 32768 : index
%c4_81 = arith.constant 4 : index
%c512_82 = arith.constant 512 : index
%c64_83 = arith.constant 64 : index
%c1_84 = arith.constant 1 : index
%c1_85 = arith.constant 1 : index
%c1_86 = arith.constant 1 : index
%c8_87 = arith.constant 8 : index
%c4_88 = arith.constant 4 : index
%c8_89 = arith.constant 8 : index
%c4_90 = arith.constant 4 : index
%c0_91 = arith.constant 0 : index
%c0_92 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_74[] [] [], %arg20[%c0_92, %c0_91, %c0_77, %c0_78, %7, %6] [%c1_85, %c1_86, %c8_87, %c4_88, %c8_89, %c4_90] [%c32768_79, %c32768_80, %c4_81, %c512_82, %c64_83, %c1_84]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_56, %alloc_74 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_93: i32, %out: i32):
%8 = arith.muli %in, %in_93 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_56 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_74 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_30 = memref.transpose %alloc_29 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
%c0_31 = arith.constant 0 : index
%c0_32 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1024_33 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%c4_34 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c1_35 = arith.constant 1 : index
%c1_36 = arith.constant 1 : index
%c1_37 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c4_40 = arith.constant 4 : index
%c4096_41 = arith.constant 4096 : index
%c4096_42 = arith.constant 4096 : index
%c64_43 = arith.constant 64 : index
%c1_44 = arith.constant 1 : index
%c1_45 = arith.constant 1 : index
%c1_46 = arith.constant 1 : index
%c32_47 = arith.constant 32 : index
%c32_48 = arith.constant 32 : index
%c0_49 = arith.constant 0 : index
%c0_50 = arith.constant 0 : index
%c0_51 = arith.constant 0 : index
%c0_52 = arith.constant 0 : index
%c0_53 = arith.constant 0 : index
%c0_54 = arith.constant 0 : index
air.dma_memcpy_nd (%arg18[%c0_31, %c0_32, %5, %6] [%c1_45, %c1_46, %c32_47, %c32_48] [%c4096_41, %c4096_42, %c64_43, %c1_44], %alloc_29[%c0_54, %c0_53, %c0_52, %c0_51, %c0_50, %c0_49] [%c1_36, %c1_37, %c8, %c4_38, %c8_39, %c4_40] [%c1024, %c1024_33, %c16, %c4_34, %c128, %c1_35]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_9[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
%c0_11 = arith.constant 0 : index
%c0_12 = arith.constant 0 : index
%c0_13 = arith.constant 0 : index
%c0_14 = arith.constant 0 : index
%c64_15 = arith.constant 64 : index
%c1_16 = arith.constant 1 : index
%c64_17 = arith.constant 64 : index
%c64_18 = arith.constant 64 : index
%c2048_19 = arith.constant 2048 : index
%c1_20 = arith.constant 1 : index
%c64_21 = arith.constant 64 : index
%c64_22 = arith.constant 64 : index
%c1_23 = arith.constant 1 : index
%c1_24 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%c4096_25 = arith.constant 4096 : index
air.dma_memcpy_nd (%arg13[%3, %4] [%c64_21, %c64_22] [%c2048_19, %c1_20], %alloc_9[%c0_11, %c0_12, %c0_13, %c0_14] [%c1_24, %c1_23, %c64_17, %c64_18] [%c4096_25, %c4096, %c64_15, %c1_16]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
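// Canonicalization has already stripped the dead view ops and uniqued the constants,
// so CSE has little left to do on the dump below, and the structure is now easy to
// read: two L2-fill DMAs (A and B tiles) per segment, a 2x2 air.herd, a per-tile
// linalg.fill of the accumulator, an scf.for over K in 16 steps (step 4, with
// %7 = %arg19 * 8 selecting a 32-wide K slice of the 512-deep L2 buffers), and a
// final DMA of the 64x64 result tile back to the 2048x2048 output.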
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependency (air-dependency) //----- //
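// air-dependency rewrites this function into explicitly asynchronous form: each
// side-effecting op (hal.interface.binding.subspan, memref.assume_alignment,
// memref.alloc, linalg.fill, the matmul linalg.generic, every memref.dealloc) is
// wrapped in an air.execute region that yields an !air.async.token; air.launch,
// air.segment, air.herd and air.dma_memcpy_nd become `async` with explicit
// dependency lists; and the scf.for loop threads a token through iter_args so each
// K step waits on the previous one. The dump after this pass shows the tokenized IR.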
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependencyScheduleOpt (air-dependency-schedule-opt) //----- //
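// The dump below is air-dependency's output (note the air.execute wrappers, the
// air.wait_all joins, and the id attributes naming dependency-graph nodes).
// Comparing with the next dump, the visible effect of air-dependency-schedule-opt
// here is broadcast detection: the A-tile load (id = 3) indexes L2 only through
// %arg12, the herd row, and the B-tile load (id = 4) only through %arg13, the herd
// column, so the pass attaches a broadcast_pattern affine_set to each, marking the
// transfer as reusable across the other herd dimension.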
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%async_token_12, %async_token_14, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%c0_36 = arith.constant 0 : index
%c1_37 = arith.constant 1 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c32768_40 = arith.constant 32768 : index
%c2048_41 = arith.constant 2048 : index
%c512_42 = arith.constant 512 : index
%c64_43 = arith.constant 64 : index
%async_token_44, %results_45 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_46, %results_47 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_46, %async_token_44, %arg20] (%results_47[] [] [], %arg17[%c0_36, %c0_36, %c0_36, %c0_36, %results_29, %results_45] [%c1_37, %c1_37, %c4_38, %c8_39, %c4_38, %c8_39] [%c32768_40, %c32768_40, %c8_39, %c2048_41, %c512_42, %c1_37]) {id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_48, %async_token_44, %arg20] (%results_49[] [] [], %arg18[%c0_36, %c0_36, %c0_36, %c0_36, %results_45, %results_31] [%c1_37, %c1_37, %c8_39, %c4_38, %c8_39, %c4_38] [%c32768_40, %c32768_40, %c4_38, %c512_42, %c64_43, %c1_37]) {id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_50 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_47, %results_49 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_53: i32, %out: i32):
%12 = arith.muli %in, %in_53 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_51 = air.execute [%async_token_50] {
memref.dealloc %results_47 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_52 = air.execute [%async_token_50] {
memref.dealloc %results_49 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_50] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%async_token_8, %async_token_10, %4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4, %2] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4, %3] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRSpecializeDmaBroadcast (air-specialize-dma-broadcast) //----- //
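// air-specialize-dma-broadcast consumes the broadcast_pattern annotations on DMA
// ids 3 and 4 below. In the next dump each annotated DMA is split into an
// affine.if / else over the tile coordinates (%arg12, %arg13): the A load takes one
// branch per herd row (broadcast_set s0 == 0 with source row offset %c0, else
// s0 - 1 == 0 with offset %c32) and the B load one branch per herd column, so each
// operand needs two physical L2-to-L1 transfers per K step instead of four, with
// each transfer broadcast to the pair of tiles that shares it.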
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%async_token_12, %async_token_14, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_26, %c0_26, %c0_26, %c0_26, %results_29, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 - s0 == 0, d1 >= 0, -d1 + 1 >= 0, s0 >= 0, -s0 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_26, %c0_26, %c0_26, %c0_26, %results_37, %results_31] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 1 >= 0, d1 - s0 == 0, s0 >= 0, -s0 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%async_token_8, %async_token_10, %4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4, %2] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4, %3] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before DmaToChannel (air-dma-to-channel) //----- //
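// air-dma-to-channel replaces air.dma_memcpy_nd ops that cross a memory-hierarchy
// boundary with put/get pairs on declared air.channel endpoints, turning each memcpy
// into an explicit producer/consumer stream whose two sides can sit in different
// regions (launch vs. segment, segment vs. herd). A rough sketch of the expected
// shape, with hypothetical names, not taken from this log:
//   air.channel @channel_0 [1, 1]
//   ...
//   %put = air.channel.put async [%tok] @channel_0[] (%src[%off0, %off1] [%sz0, %sz1] [%st0, %st1]) : (memref<2048x512xi32>)
//   ...
//   %get = air.channel.get async [%tok2] @channel_0[] (%dst[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)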
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%async_token_12, %async_token_14, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c0_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c32_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c0_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>, id = 5 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c32_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 - 1 == 0)>, id = 6 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 7 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%async_token_8, %async_token_10, %4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 8 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4, %2] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4, %3] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
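// NOTE (annotation, not part of the compiler dump): compared with the dump above,
// every explicit `air.dma_memcpy_nd` has been rewritten as an
// `air.channel.put`/`air.channel.get` pair over the `air.channel` symbols declared
// below, presumably by the AIR DMA-to-channel conversion that runs between these
// two dumps. The `broadcast_set` constraints carried by the old DMAs reappear as
// `broadcast_shape` attributes on @channel_0..@channel_3.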
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
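  // NOTE (annotation): reading the puts/gets below, the channels divide up as
  // follows: @channel_4 and @channel_5 move the 64x512 A tile and the 512x64 B
  // tile from the global memrefs into memory space 1; @channel_0/@channel_1
  // broadcast A sub-tiles and @channel_2/@channel_3 broadcast B sub-tiles into
  // the 2x2 herd (hence the [1, 2] and [2, 1] broadcast shapes); @channel_6
  // collects the four 32x32 accumulator tiles back into the 1x1x64x64 buffer;
  // and @channel_7 writes that buffer out to the 2048x2048 result.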
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%c2048 = arith.constant 2048 : index
%c64_10 = arith.constant 64 : index
%c1_11 = arith.constant 1 : index
%c512_12 = arith.constant 512 : index
%c0_13 = arith.constant 0 : index
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_14] @channel_5[] (%arg5[%c0_13, %results_15] [%c512_12, %c64_10] [%c2048, %c1_11]) : (memref<512x2048xi32>)
%c2048_16 = arith.constant 2048 : index
%c64_17 = arith.constant 64 : index
%c1_18 = arith.constant 1 : index
%async_token_19, %results_20 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_21, %results_22 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_19, %async_token_21] @channel_7[] (%arg6[%results_20, %results_22] [%c64_17, %c64_17] [%c2048_16, %c1_18]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048_23 = arith.constant 2048 : index
%c64_24 = arith.constant 64 : index
%c1_25 = arith.constant 1 : index
%c512_26 = arith.constant 512 : index
%c0_27 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_28, %results_29 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %23 : index
} {id = 7 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %23 : index
} {id = 8 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_28, %async_token_32] @channel_4[] (%results_33[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_34, %results_35 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_30, %async_token_34] @channel_5[] (%results_35[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_36, %results_37 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%c0_38 = arith.constant 0 : index
%c1_39 = arith.constant 1 : index
%c512_40 = arith.constant 512 : index
%c2048_41 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_42 = arith.constant 0 : index
%c64_43 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_44, %results_45 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_46, %results_47 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async
%8 = air.wait_all async [%async_token_32, %async_token_44, %async_token_46, %7] {id = 2 : i32}
%9 = scf.for %arg12 = %c0_42 to %c64_43 step %c4 iter_args(%arg13 = %8) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_0[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c0_108, %results_107] [%c1_39, %c1_39, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_41, %c512_40, %c1_39]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_50 = arith.constant 1 : index
%c512_51 = arith.constant 512 : index
%c2048_52 = arith.constant 2048 : index
%c8_53 = arith.constant 8 : index
%c32768_54 = arith.constant 32768 : index
%c0_55 = arith.constant 0 : index
%c64_56 = arith.constant 64 : index
%c4_57 = arith.constant 4 : index
%async_token_58, %results_59 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_60, %results_61 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_62, %results_63 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = air.wait_all async
%11 = air.wait_all async [%async_token_32, %async_token_58, %async_token_60, %10] {id = 2 : i32}
%12 = scf.for %arg12 = %c0_55 to %c64_56 step %c4_57 iter_args(%arg13 = %11) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_108 = arith.constant 32 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_1[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c32_108, %results_107] [%c1_50, %c1_50, %c4_57, %c8_53, %c4_57, %c8_53] [%c32768_54, %c32768_54, %c8_53, %c2048_52, %c512_51, %c1_50]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c0_64 = arith.constant 0 : index
%c1_65 = arith.constant 1 : index
%c512_66 = arith.constant 512 : index
%c8_67 = arith.constant 8 : index
%c32768_68 = arith.constant 32768 : index
%c0_69 = arith.constant 0 : index
%c64_70 = arith.constant 64 : index
%c4_71 = arith.constant 4 : index
%async_token_72, %results_73 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_74, %results_75 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_76, %results_77 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async
%14 = air.wait_all async [%async_token_34, %async_token_72, %async_token_74, %13] {id = 2 : i32}
%15 = scf.for %arg12 = %c0_69 to %c64_70 step %c4_71 iter_args(%arg13 = %14) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_107 = arith.constant 0 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_2[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c0_107] [%c1_65, %c1_65, %c8_67, %c4_71, %c8_67, %c4_71] [%c32768_68, %c32768_68, %c4_71, %c512_66, %c64_70, %c1_65]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_78 = arith.constant 1 : index
%c512_79 = arith.constant 512 : index
%c8_80 = arith.constant 8 : index
%c32768_81 = arith.constant 32768 : index
%c0_82 = arith.constant 0 : index
%c64_83 = arith.constant 64 : index
%c4_84 = arith.constant 4 : index
%async_token_85, %results_86 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_87, %results_88 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_89, %results_90 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%16 = air.wait_all async
%17 = air.wait_all async [%async_token_34, %async_token_85, %async_token_87, %16] {id = 2 : i32}
%18 = scf.for %arg12 = %c0_82 to %c64_83 step %c4_84 iter_args(%arg13 = %17) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_107 = arith.constant 32 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_3[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c32_107] [%c1_78, %c1_78, %c8_80, %c4_84, %c8_80, %c4_84] [%c32768_81, %c32768_81, %c4_84, %c512_79, %c64_83, %c1_78]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_91 = arith.constant 1 : index
%c0_92 = arith.constant 0 : index
%c0_93 = arith.constant 0 : index
%c2_94 = arith.constant 2 : index
%c2_95 = arith.constant 2 : index
%19 = air.wait_all async [%async_token_36]
%20 = scf.parallel (%arg12, %arg13) = (%c0_92, %c0_93) to (%c2_94, %c2_95) step (%c1_91, %c1_91) init (%19) -> !air.async.token {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c1_101 = arith.constant 1 : index
%c0_102 = arith.constant 0 : index
%c64_103 = arith.constant 64 : index
%async_token_104, %results_105 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %25 : index
} {id = 12 : i32}
%async_token_106, %results_107 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %25 : index
} {id = 13 : i32}
%23 = air.channel.get async [%async_token_106, %async_token_104, %async_token_36, %19] @channel_6[%arg12, %arg13] (%results_37[%c0_102, %c0_102, %results_105, %results_107] [%c1_101, %c1_101, %c32_99, %c32_99] [%c4096_100, %c4096_100, %c64_103, %c1_101]) : (memref<1x1x64x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.reduce(%24 : !air.async.token) {
^bb0(%arg14: !air.async.token, %arg15: !air.async.token):
%25 = air.wait_all async [%arg14, %arg15]
scf.reduce.return %25 : !air.async.token
}
}
%21 = air.herd @herd_0 async [%async_token_32, %async_token_34, %async_token_36] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_37, %arg17=%results_33, %arg18=%results_35) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_101 = arith.constant 1 : index
%c512_102 = arith.constant 512 : index
%c2048_103 = arith.constant 2048 : index
%c8_104 = arith.constant 8 : index
%c32768_105 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_106 = arith.constant 0 : index
%c64_107 = arith.constant 64 : index
%c4_108 = arith.constant 4 : index
%async_token_109, %results_110 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %26 : index
} {id = 12 : i32}
%async_token_111, %results_112 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %26 : index
} {id = 13 : i32}
%async_token_113, %results_114 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_115 = air.execute [%async_token_113] {
linalg.fill ins(%c0_i32 : i32) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%23 = air.wait_all async [%async_token_109, %async_token_111, %async_token_115] {id = 2 : i32}
%24 = scf.for %arg19 = %c0_106 to %c64_107 step %c4_108 iter_args(%arg20 = %23) -> (!air.async.token) {
%async_token_117, %results_118 = air.execute [%arg20] -> (index) {
%29 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %29 : index
} {id = 16 : i32}
%async_token_119, %results_120 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%26 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_0[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_1[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_121, %results_122 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%27 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_2[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_3[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_123 = air.execute [%27, %26, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_120, %results_122 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_126: i32, %out: i32):
%29 = arith.muli %in, %in_126 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
}
} {id = 19 : i32}
%async_token_124 = air.execute [%async_token_123] {
memref.dealloc %results_120 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_125 = air.execute [%async_token_123] {
memref.dealloc %results_122 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%28 = air.wait_all async [%arg20, %async_token_123] {id = 1 : i32}
scf.yield %28 : !air.async.token
}
%25 = air.channel.put async [%24] @channel_6[%arg12, %arg13] (%results_114[%c0_106, %c0_106, %c0_106, %c0_106, %c0_106, %c0_106] [%c1_101, %c1_101, %c8_104, %c4_108, %c8_104, %c4_108] [%c1024, %c1024, %c16, %c4_108, %c128, %c1_101]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_116 = air.execute [%25] {
memref.dealloc %results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%22 = air.channel.put async [%async_token_28, %async_token_30, %21] @channel_7[] (%results_37[%c0_27, %c0_27, %c0_27, %c0_27] [%c1_25, %c1_25, %c64_24, %c64_24] [%c4096, %c4096, %c64_24, %c1_25]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_96 = air.execute [%21, %5] {
memref.dealloc %results_33 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_97 = air.execute [%21, %6] {
memref.dealloc %results_35 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_98 = air.execute [%22] {
memref.dealloc %results_37 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
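// NOTE (annotation, not part of the compiler dump): the canonicalizer has folded
// the duplicated `arith.constant`s, deleted the dead constants inside the
// `affine.if` regions, collapsed the redundant `air.wait_all` chains, and dropped
// block arguments that are no longer used: `air.segment` now forwards only the two
// induction indices and `air.herd` takes no args at all, since all data reaches
// them through the channels.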
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %14 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %14 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%11 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %15 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %15 : index
} {id = 13 : i32}
%14 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%14 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%15 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %15 : !air.async.token
}
}
%12 = air.herd @herd_0 async [%async_token_26, %async_token_28, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %17 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %17 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%14 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%15 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %14) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %20 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%17 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%18 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_62 = air.execute [%18, %17, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%20 = arith.muli %in, %in_65 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%19 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %19 : !air.async.token
}
%16 = air.channel.put async [%15] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%16] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%13 = air.channel.put async [%async_token_22, %async_token_24, %12] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%12, %5] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%12, %6] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%13] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependencyCanonicalize (air-dependency-canonicalize) //----- //
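// NOTE (annotation, not part of the compiler dump): CSE finds nothing left to
// eliminate here; the dump below appears verbatim-identical to the previous one,
// so this is the unchanged input handed to air-dependency-canonicalize.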
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %14 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %14 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%11 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %15 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %15 : index
} {id = 13 : i32}
%14 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%14 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%15 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %15 : !air.async.token
}
}
%12 = air.herd @herd_0 async [%async_token_26, %async_token_28, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %17 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %17 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%14 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%15 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %14) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %20 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%17 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%18 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_62 = air.execute [%18, %17, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%20 = arith.muli %in, %in_65 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%19 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %19 : !air.async.token
}
%16 = air.channel.put async [%15] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%16] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%13 = air.channel.put async [%async_token_22, %async_token_24, %12] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%12, %5] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%12, %6] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%13] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
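// NOTE (annotation, not part of the compiler dump): air-dependency-canonicalize
// has pruned the async dependency graph. Visible differences from the dump above:
// the `{id = ...}` bookkeeping attributes move from the `air.execute` ops onto the
// channel ops, dependency lists shrink to the tokens that are actually needed
// (the `air.channel.put` in the reduction loop now waits only on its index
// computation), and dead index computations in the segment are replaced by bare
// `air.wait_all async` placeholder tokens.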
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 1 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) {id = 3 : i32} : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%5 = air.wait_all async
%6 = air.wait_all async
%async_token_22, %results_23 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
}
%7 = air.channel.get async [%5, %async_token_22] @channel_4[] (%results_23[] [] []) {id = 4 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_24, %results_25 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
}
%8 = air.channel.get async [%6, %async_token_24] @channel_5[] (%results_25[] [] []) {id = 5 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
}
%9 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_0[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 6 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_1[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 7 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%11 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_2[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 8 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%12 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_3[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 9 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%13 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_26) -> !air.async.token {
%async_token_31, %results_32 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %17 : index
}
%async_token_33, %results_34 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %17 : index
}
%16 = air.channel.get async [%async_token_26, %async_token_33, %async_token_31] @channel_6[%arg9, %arg10] (%results_27[%c0_21, %c0_21, %results_32, %results_34] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) {id = 10 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%16 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%17 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %17 : !air.async.token
}
}
%14 = air.herd @herd_0 async [%async_token_22, %async_token_24, %async_token_26] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 3 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_31 = arith.constant 1 : index
%c8_32 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_33 = arith.constant 0 : index
%c64_34 = arith.constant 64 : index
%c4_35 = arith.constant 4 : index
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
}
%async_token_38 = air.execute [%async_token_36] {
linalg.fill ins(%c0_i32 : i32) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>)
}
%16 = scf.for %arg13 = %c0_33 to %c64_34 step %c4_35 iter_args(%arg14 = %async_token_38) -> (!air.async.token) {
%async_token_40, %results_41 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%18 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_40, %arg14] @channel_0[%arg9, %arg10] (%results_41[] [] []) {id = 11 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_40, %arg14] @channel_1[%arg9, %arg10] (%results_41[] [] []) {id = 12 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_42, %results_43 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%19 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_42, %arg14] @channel_2[%arg9, %arg10] (%results_43[] [] []) {id = 13 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_42, %arg14] @channel_3[%arg9, %arg10] (%results_43[] [] []) {id = 14 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_44 = air.execute [%19, %18] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_41, %results_43 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_47: i32, %out: i32):
%20 = arith.muli %in, %in_47 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
}
%async_token_45 = air.execute [%async_token_44] {
memref.dealloc %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%async_token_46 = air.execute [%async_token_44] {
memref.dealloc %results_43 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
scf.yield %async_token_44 : !air.async.token
}
%17 = air.channel.put async [%16] @channel_6[%arg9, %arg10] (%results_37[%c0_33, %c0_33, %c0_33, %c0_33, %c0_33, %c0_33] [%c1_31, %c1_31, %c8_32, %c4_35, %c8_32, %c4_35] [%c1024, %c1024, %c16, %c4_35, %c128, %c1_31]) {id = 15 : i32} : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_39 = air.execute [%17] {
memref.dealloc %results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
air.herd_terminator
}
%15 = air.channel.put async [%5, %6, %14] @channel_7[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) {id = 16 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_28 = air.execute [%7] {
memref.dealloc %results_23 : memref<1x1x64x512xi32, 1 : i32>
}
%async_token_29 = air.execute [%8] {
memref.dealloc %results_25 : memref<1x1x512x64xi32, 1 : i32>
}
%async_token_30 = air.execute [%15] {
memref.dealloc %results_27 : memref<1x1x64x64xi32, 1 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
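// NOTE (editorial): comparing this dump with the next one ("Before CSE") shows the
// canonicalizer's one visible effect here: air.segment @segment_0 loses its dead
// args(%arg7=%arg0, %arg8=%arg1) operands, which the segment body never uses, and the
// remaining block arguments are renumbered accordingly.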
// -----// IR Dump Before CSE (cse) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 1 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) {id = 3 : i32} : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%5 = air.wait_all async
%6 = air.wait_all async
%async_token_22, %results_23 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
}
%7 = air.channel.get async [%5, %async_token_22] @channel_4[] (%results_23[] [] []) {id = 4 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_24, %results_25 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
}
%8 = air.channel.get async [%6, %async_token_24] @channel_5[] (%results_25[] [] []) {id = 5 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
}
%9 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_0[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 6 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%10 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_1[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 7 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%11 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_2[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 8 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%12 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_3[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 9 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%13 = scf.parallel (%arg7, %arg8) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_26) -> !air.async.token {
%async_token_31, %results_32 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg7]
air.execute_terminator %17 : index
}
%async_token_33, %results_34 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg8]
air.execute_terminator %17 : index
}
%16 = air.channel.get async [%async_token_26, %async_token_33, %async_token_31] @channel_6[%arg7, %arg8] (%results_27[%c0_21, %c0_21, %results_32, %results_34] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) {id = 10 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%16 : !air.async.token) {
^bb0(%arg9: !air.async.token, %arg10: !air.async.token):
%17 = air.wait_all async [%arg9, %arg10]
scf.reduce.return %17 : !air.async.token
}
}
%14 = air.herd @herd_0 async [%async_token_22, %async_token_24, %async_token_26] tile (%arg7, %arg8) in (%arg9=%c2, %arg10=%c2) attributes {id = 3 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_31 = arith.constant 1 : index
%c8_32 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_33 = arith.constant 0 : index
%c64_34 = arith.constant 64 : index
%c4_35 = arith.constant 4 : index
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
}
%async_token_38 = air.execute [%async_token_36] {
linalg.fill ins(%c0_i32 : i32) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>)
}
%16 = scf.for %arg11 = %c0_33 to %c64_34 step %c4_35 iter_args(%arg12 = %async_token_38) -> (!air.async.token) {
%async_token_40, %results_41 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%18 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_0[%arg7, %arg8] (%results_41[] [] []) {id = 11 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_1[%arg7, %arg8] (%results_41[] [] []) {id = 12 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_42, %results_43 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%19 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_2[%arg7, %arg8] (%results_43[] [] []) {id = 13 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_3[%arg7, %arg8] (%results_43[] [] []) {id = 14 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_44 = air.execute [%19, %18] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_41, %results_43 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_47: i32, %out: i32):
%20 = arith.muli %in, %in_47 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
}
%async_token_45 = air.execute [%async_token_44] {
memref.dealloc %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%async_token_46 = air.execute [%async_token_44] {
memref.dealloc %results_43 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
scf.yield %async_token_44 : !air.async.token
}
%17 = air.channel.put async [%16] @channel_6[%arg7, %arg8] (%results_37[%c0_33, %c0_33, %c0_33, %c0_33, %c0_33, %c0_33] [%c1_31, %c1_31, %c8_32, %c4_35, %c8_32, %c4_35] [%c1024, %c1024, %c16, %c4_35, %c128, %c1_31]) {id = 15 : i32} : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_39 = air.execute [%17] {
memref.dealloc %results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
air.herd_terminator
}
%15 = air.channel.put async [%5, %6, %14] @channel_7[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) {id = 16 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_28 = air.execute [%7] {
memref.dealloc %results_23 : memref<1x1x64x512xi32, 1 : i32>
}
%async_token_29 = air.execute [%8] {
memref.dealloc %results_25 : memref<1x1x512x64xi32, 1 : i32>
}
%async_token_30 = air.execute [%15] {
memref.dealloc %results_27 : memref<1x1x64x64xi32, 1 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
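// NOTE (editorial): CSE leaves this function essentially unchanged. The repeated
// affine.apply of affine_map<()[s0] -> (s0 * 64)> on %arg0 (and on %arg1) cannot be
// deduplicated because each one lives in its own air.execute region producing a distinct
// async token; the next dump differs mainly in printing scope, showing the func.func
// alone without the enclosing module and air.channel declarations.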
// -----// IR Dump Before AIRSegmentLoopFusion (air-loop-fusion) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 1 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) {id = 3 : i32} : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%5 = air.wait_all async
%6 = air.wait_all async
%async_token_22, %results_23 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
}
%7 = air.channel.get async [%5, %async_token_22] @channel_4[] (%results_23[] [] []) {id = 4 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_24, %results_25 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
}
%8 = air.channel.get async [%6, %async_token_24] @channel_5[] (%results_25[] [] []) {id = 5 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
}
%9 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_0[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 6 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%10 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_1[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 7 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%11 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_2[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 8 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%12 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_3[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 9 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%13 = scf.parallel (%arg7, %arg8) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_26) -> !air.async.token {
%async_token_31, %results_32 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg7]
air.execute_terminator %17 : index
}
%async_token_33, %results_34 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg8]
air.execute_terminator %17 : index
}
%16 = air.channel.get async [%async_token_26, %async_token_33, %async_token_31] @channel_6[%arg7, %arg8] (%results_27[%c0_21, %c0_21, %results_32, %results_34] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) {id = 10 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%16 : !air.async.token) {
^bb0(%arg9: !air.async.token, %arg10: !air.async.token):
%17 = air.wait_all async [%arg9, %arg10]
scf.reduce.return %17 : !air.async.token
}
}
%14 = air.herd @herd_0 async [%async_token_22, %async_token_24, %async_token_26] tile (%arg7, %arg8) in (%arg9=%c2, %arg10=%c2) attributes {id = 3 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_31 = arith.constant 1 : index
%c8_32 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_33 = arith.constant 0 : index
%c64_34 = arith.constant 64 : index
%c4_35 = arith.constant 4 : index
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
}
%async_token_38 = air.execute [%async_token_36] {
linalg.fill ins(%c0_i32 : i32) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>)
}
%16 = scf.for %arg11 = %c0_33 to %c64_34 step %c4_35 iter_args(%arg12 = %async_token_38) -> (!air.async.token) {
%async_token_40, %results_41 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%18 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_0[%arg7, %arg8] (%results_41[] [] []) {id = 11 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_1[%arg7, %arg8] (%results_41[] [] []) {id = 12 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_42, %results_43 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%19 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_2[%arg7, %arg8] (%results_43[] [] []) {id = 13 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_3[%arg7, %arg8] (%results_43[] [] []) {id = 14 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_44 = air.execute [%19, %18] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_41, %results_43 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_47: i32, %out: i32):
%20 = arith.muli %in, %in_47 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
}
%async_token_45 = air.execute [%async_token_44] {
memref.dealloc %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%async_token_46 = air.execute [%async_token_44] {
memref.dealloc %results_43 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
scf.yield %async_token_44 : !air.async.token
}
%17 = air.channel.put async [%16] @channel_6[%arg7, %arg8] (%results_37[%c0_33, %c0_33, %c0_33, %c0_33, %c0_33, %c0_33] [%c1_31, %c1_31, %c8_32, %c4_35, %c8_32, %c4_35] [%c1024, %c1024, %c16, %c4_35, %c128, %c1_31]) {id = 15 : i32} : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_39 = air.execute [%17] {
memref.dealloc %results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
air.herd_terminator
}
%15 = air.channel.put async [%5, %6, %14] @channel_7[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) {id = 16 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_28 = air.execute [%7] {
memref.dealloc %results_23 : memref<1x1x64x512xi32, 1 : i32>
}
%async_token_29 = air.execute [%8] {
memref.dealloc %results_25 : memref<1x1x512x64xi32, 1 : i32>
}
%async_token_30 = air.execute [%15] {
memref.dealloc %results_27 : memref<1x1x64x64xi32, 1 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}
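// NOTE (editorial): compilation aborts past this point. The diagnostics below were
// emitted while running air-loop-fusion; the generic-form dump that follows is the
// diagnostic's snapshot of the invalid IR, not another pass boundary.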
<stdin>:20:16: error: operand #0 does not dominate this use
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
^
<stdin>:20:16: note: see current operation: %13 = "air.channel.get"(%22#0, %11, %22#1) <{chan_name = @channel_4, operandSegmentSizes = array<i32: 2, 0, 1, 0, 0, 0>}> {id = 4 : i32} : (!air.async.token, !air.async.token, memref<1x1x64x512xi32, 1 : i32>) -> !air.async.token
<stdin>:20:16: note: operand defined here (op in a child region)
<stdin>:3:5: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
^
<stdin>:3:5: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg0: !hal.device):
%0 = "arith.constant"() <{value = 1 : index}> : () -> index
%1 = "arith.constant"() <{value = 1 : index}> : () -> index
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
"hal.return"(%0, %1, %2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "matmul_large_dispatch_0_matmul_2048x2048x512_i32", translation_info = #iree_codegen.translation_info<None>} : () -> ()
"builtin.module"() ({
"air.channel"() <{size = [1, 1], sym_name = "channel_7"}> : () -> ()
"air.channel"() <{size = [2, 2], sym_name = "channel_6"}> : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_5"}> : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_4"}> : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_3"}> {broadcast_shape = [2, 1]} : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_2"}> {broadcast_shape = [2, 1]} : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_1"}> {broadcast_shape = [1, 2]} : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_0"}> {broadcast_shape = [1, 2]} : () -> ()
"func.func"() <{function_type = () -> (), sym_name = "matmul_large_dispatch_0_matmul_2048x2048x512_i32"}> ({
%0 = "arith.constant"() <{value = 32 : index}> : () -> index
%1 = "arith.constant"() <{value = 0 : index}> : () -> index
%2:2 = "air.execute"() ({
%9 = "hal.interface.binding.subspan"(%1) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<2048x512xi32>
"air.execute_terminator"(%9) : (memref<2048x512xi32>) -> ()
}) : () -> (!air.async.token, memref<2048x512xi32>)
%3 = "air.execute"(%2#0) ({
"memref.assume_alignment"(%2#1) <{alignment = 64 : i32}> : (memref<2048x512xi32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%4:2 = "air.execute"() ({
%9 = "hal.interface.binding.subspan"(%1) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<512x2048xi32>
"air.execute_terminator"(%9) : (memref<512x2048xi32>) -> ()
}) : () -> (!air.async.token, memref<512x2048xi32>)
%5 = "air.execute"(%4#0) ({
"memref.assume_alignment"(%4#1) <{alignment = 64 : i32}> : (memref<512x2048xi32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%6:2 = "air.execute"() ({
%9 = "hal.interface.binding.subspan"(%1) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<2048x2048xi32>
"air.execute_terminator"(%9) : (memref<2048x2048xi32>) -> ()
}) : () -> (!air.async.token, memref<2048x2048xi32>)
%7 = "air.execute"(%6#0) ({
"memref.assume_alignment"(%6#1) <{alignment = 64 : i32}> : (memref<2048x2048xi32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%8 = "air.launch"(%7, %5, %3, %0, %0, %2#1, %4#1, %6#1) <{operandSegmentSizes = array<i32: 3, 2, 3>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: memref<2048x512xi32>, %arg5: memref<512x2048xi32>, %arg6: memref<2048x2048xi32>):
%9 = "arith.constant"() <{value = 2048 : index}> : () -> index
%10 = "arith.constant"() <{value = 64 : index}> : () -> index
%11 = "arith.constant"() <{value = 1 : index}> : () -> index
%12 = "arith.constant"() <{value = 512 : index}> : () -> index
%13 = "arith.constant"() <{value = 0 : index}> : () -> index
%14:2 = "air.execute"() ({
%22 = "affine.apply"(%arg0) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%15 = "air.channel.put"(%14#0, %arg4, %14#1, %13, %10, %12, %12, %11) <{chan_name = @channel_4, operandSegmentSizes = array<i32: 1, 0, 1, 2, 2, 2>}> {id = 1 : i32} : (!air.async.token, memref<2048x512xi32>, index, index, index, index, index, index) -> !air.async.token
%16:2 = "air.execute"() ({
%22 = "affine.apply"(%arg1) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%17 = "air.channel.put"(%16#0, %arg5, %13, %16#1, %12, %10, %9, %11) <{chan_name = @channel_5, operandSegmentSizes = array<i32: 1, 0, 1, 2, 2, 2>}> {id = 2 : i32} : (!air.async.token, memref<512x2048xi32>, index, index, index, index, index, index) -> !air.async.token
%18:2 = "air.execute"() ({
%22 = "affine.apply"(%arg0) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%19:2 = "air.execute"() ({
%22 = "affine.apply"(%arg1) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%20 = "air.channel.get"(%19#0, %18#0, %arg6, %18#1, %19#1, %10, %10, %9, %11) <{chan_name = @channel_7, operandSegmentSizes = array<i32: 2, 0, 1, 2, 2, 2>}> {id = 3 : i32} : (!air.async.token, !air.async.token, memref<2048x2048xi32>, index, index, index, index, index, index) -> !air.async.token
%21 = "air.segment"() <{operandSegmentSizes = array<i32: 0, 0, 0>, sym_name = "segment_0"}> ({
%22 = "arith.constant"() <{value = 32 : index}> : () -> index
%23 = "arith.constant"() <{value = 4 : index}> : () -> index
%24 = "arith.constant"() <{value = 32768 : index}> : () -> index
%25 = "arith.constant"() <{value = 8 : index}> : () -> index
%26 = "arith.constant"() <{value = 4096 : index}> : () -> index
%27 = "arith.constant"() <{value = 2048 : index}> : () -> index
%28 = "arith.constant"() <{value = 64 : index}> : () -> index
%29 = "arith.constant"() <{value = 1 : index}> : () -> index
%30 = "arith.constant"() <{value = 512 : index}> : () -> index
%31 = "arith.constant"() <{value = 0 : index}> : () -> index
%32 = "arith.constant"() <{value = 2 : index}> : () -> index
%33 = "air.wait_all"() : () -> !air.async.token
%34 = "air.wait_all"() : () -> !air.async.token
%35 = "air.channel.get"(%44#0, %33, %44#1) <{chan_name = @channel_4, operandSegmentSizes = array<i32: 2, 0, 1, 0, 0, 0>}> {id = 4 : i32} : (!air.async.token, !air.async.token, memref<1x1x64x512xi32, 1 : i32>) -> !air.async.token
%36 = "air.channel.get"(%45#0, %34, %45#1) <{chan_name = @channel_5, operandSegmentSizes = array<i32: 2, 0, 1, 0, 0, 0>}> {id = 5 : i32} : (!air.async.token, !air.async.token, memref<1x1x512x64xi32, 1 : i32>) -> !air.async.token
%37 = "scf.for"(%31, %30, %22, %36) ({
^bb0(%arg7: index, %arg8: !air.async.token):
%44:2 = "air.execute"() ({
%53 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x64x512xi32, 1 : i32>
"air.execute_terminator"(%53) : (memref<1x1x64x512xi32, 1 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x64x512xi32, 1 : i32>)
%45:2 = "air.execute"() ({
%53 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x512x64xi32, 1 : i32>
"air.execute_terminator"(%53) : (memref<1x1x512x64xi32, 1 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x512x64xi32, 1 : i32>)
%46 = "air.channel.put"(%arg8, %44#1, %31, %31, %31, %31, %31, %arg7, %29, %29, %23, %25, %23, %25, %24, %24, %25, %27, %30, %29) <{chan_name = @channel_0, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 6 : i32} : (!air.async.token, memref<1x1x64x512xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%47 = "air.channel.put"(%arg8, %44#1, %31, %31, %31, %31, %22, %arg7, %29, %29, %23, %25, %23, %25, %24, %24, %25, %27, %30, %29) <{chan_name = @channel_1, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 7 : i32} : (!air.async.token, memref<1x1x64x512xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%48 = "air.channel.put"(%arg8, %45#1, %31, %31, %31, %31, %arg7, %31, %29, %29, %25, %23, %25, %23, %24, %24, %23, %30, %28, %29) <{chan_name = @channel_2, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 8 : i32} : (!air.async.token, memref<1x1x512x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%49 = "air.channel.put"(%arg8, %45#1, %31, %31, %31, %31, %arg7, %22, %29, %29, %25, %23, %25, %23, %24, %24, %23, %30, %28, %29) <{chan_name = @channel_3, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 9 : i32} : (!air.async.token, memref<1x1x512x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%50 = "air.execute"() ({
"memref.dealloc"(%44#1) : (memref<1x1x64x512xi32, 1 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : () -> !air.async.token
%51 = "air.execute"() ({
"memref.dealloc"(%45#1) : (memref<1x1x512x64xi32, 1 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : () -> !air.async.token
%52 = "air.wait_all"(%46, %47, %48, %49, %50, %51) : (!air.async.token, !air.async.token, !air.async.token, !air.async.token, !air.async.token, !air.async.token) -> !air.async.token
"scf.yield"(%52) : (!air.async.token) -> ()
}) : (index, index, index, !air.async.token) -> !air.async.token
%38:2 = "air.execute"() ({
%44 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x64x64xi32, 1 : i32>
"air.execute_terminator"(%44) : (memref<1x1x64x64xi32, 1 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x64x64xi32, 1 : i32>)
%39 = "air.wait_all"() : () -> !air.async.token
%40 = "scf.parallel"(%31, %31, %32, %32, %29, %29, %38#0) <{operandSegmentSizes = array<i32: 2, 2, 2, 1>}> ({
^bb0(%arg7: index, %arg8: index):
%44:2 = "air.execute"() ({
%47 = "affine.apply"(%arg7) <{map = affine_map<()[s0] -> (s0 * 32)>}> : (index) -> index
"air.execute_terminator"(%47) : (index) -> ()
}) : () -> (!air.async.token, index)
%45:2 = "air.execute"() ({
%47 = "affine.apply"(%arg8) <{map = affine_map<()[s0] -> (s0 * 32)>}> : (index) -> index
"air.execute_terminator"(%47) : (index) -> ()
}) : () -> (!air.async.token, index)
%46 = "air.channel.get"(%38#0, %45#0, %44#0, %arg7, %arg8, %38#1, %31, %31, %44#1, %45#1, %29, %29, %22, %22, %26, %26, %28, %29) <{chan_name = @channel_6, operandSegmentSizes = array<i32: 3, 2, 1, 4, 4, 4>}> {id = 10 : i32} : (!air.async.token, !air.async.token, !air.async.token, index, index, memref<1x1x64x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
"scf.reduce"(%46) ({
^bb0(%arg9: !air.async.token, %arg10: !air.async.token):
%47 = "air.wait_all"(%arg9, %arg10) : (!air.async.token, !air.async.token) -> !air.async.token
"scf.reduce.return"(%47) : (!air.async.token) -> ()
}) : (!air.async.token) -> ()
}) : (index, index, index, index, index, index, !air.async.token) -> !air.async.token
%41 = "air.herd"(%38#0, %32, %32) <{operandSegmentSizes = array<i32: 1, 2, 0>, sym_name = "herd_0"}> ({
^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index):
%44 = "arith.constant"() <{value = 128 : index}> : () -> index
%45 = "arith.constant"() <{value = 16 : index}> : () -> index
%46 = "arith.constant"() <{value = 1024 : index}> : () -> index
%47 = "arith.constant"() <{value = 1 : index}> : () -> index
%48 = "arith.constant"() <{value = 8 : index}> : () -> index
%49 = "arith.constant"() <{value = 0 : i32}> : () -> i32
%50 = "arith.constant"() <{value = 0 : index}> : () -> index
%51 = "arith.constant"() <{value = 64 : index}> : () -> index
%52 = "arith.constant"() <{value = 4 : index}> : () -> index
%53:2 = "air.execute"() ({
%58 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x8x4x4xi32, 2 : i32>
"air.execute_terminator"(%58) : (memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x8x8x4x4xi32, 2 : i32>)
%54 = "air.execute"(%53#0) ({
"linalg.fill"(%49, %53#1) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg11: i32, %arg12: i32):
"linalg.yield"(%arg11) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%55 = "scf.for"(%50, %51, %52, %54) ({
^bb0(%arg11: index, %arg12: !air.async.token):
%58:2 = "air.execute"() ({
%65 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x4x8x4x8xi32, 2 : i32>
"air.execute_terminator"(%65) : (memref<1x1x4x8x4x8xi32, 2 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x4x8x4x8xi32, 2 : i32>)
%59 = "affine.if"(%arg7, %arg8) ({
%65 = "air.channel.get"(%58#0, %arg12, %arg7, %arg8, %58#1) <{chan_name = @channel_0, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 11 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x4x8x4x8xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}, {
%65 = "air.channel.get"(%58#0, %arg12, %arg7, %arg8, %58#1) <{chan_name = @channel_1, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 12 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x4x8x4x8xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}) {condition = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>} : (index, index) -> !air.async.token
%60:2 = "air.execute"() ({
%65 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4x8x4xi32, 2 : i32>
"air.execute_terminator"(%65) : (memref<1x1x8x4x8x4xi32, 2 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x8x4x8x4xi32, 2 : i32>)
%61 = "affine.if"(%arg7, %arg8) ({
%65 = "air.channel.get"(%60#0, %arg12, %arg7, %arg8, %60#1) <{chan_name = @channel_2, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 13 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x8x4x8x4xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}, {
%65 = "air.channel.get"(%60#0, %arg12, %arg7, %arg8, %60#1) <{chan_name = @channel_3, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 14 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x8x4x8x4xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}) {condition = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>} : (index, index) -> !air.async.token
%62 = "air.execute"(%61, %59) ({
"linalg.generic"(%58#1, %60#1, %53#1) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg13: i32, %arg14: i32, %arg15: i32):
%65 = "arith.muli"(%arg13, %arg14) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%66 = "arith.addi"(%arg15, %65) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%66) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token, !air.async.token) -> !air.async.token
%63 = "air.execute"(%62) ({
"memref.dealloc"(%58#1) : (memref<1x1x4x8x4x8xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%64 = "air.execute"(%62) ({
"memref.dealloc"(%60#1) : (memref<1x1x8x4x8x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
"scf.yield"(%62) : (!air.async.token) -> ()
}) : (index, index, index, !air.async.token) -> !air.async.token
%56 = "air.channel.put"(%55, %arg7, %arg8, %53#1, %50, %50, %50, %50, %50, %50, %47, %47, %48, %52, %48, %52, %46, %46, %45, %52, %44, %47) <{chan_name = @channel_6, operandSegmentSizes = array<i32: 1, 2, 1, 6, 6, 6>}> {id = 15 : i32} : (!air.async.token, index, index, memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%57 = "air.execute"(%56) ({
"memref.dealloc"(%53#1) : (memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
"air.herd_terminator"() : () -> ()
}) {id = 3 : i32} : (!air.async.token, index, index) -> !air.async.token
%42 = "air.channel.put"(%41, %34, %33, %38#1, %31, %31, %31, %31, %29, %29, %28, %28, %26, %26, %28, %29) <{chan_name = @channel_7, operandSegmentSizes = array<i32: 3, 0, 1, 4, 4, 4>}> {id = 16 : i32} : (!air.async.token, !air.async.token, !air.async.token, memref<1x1x64x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%43 = "air.execute"(%42) ({
"memref.dealloc"(%38#1) : (memref<1x1x64x64xi32, 1 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
"air.segment_terminator"() : () -> ()
}) {id = 2 : i32} : () -> !air.async.token
"air.launch_terminator"() : () -> ()
}) {id = 1 : i32} : (!air.async.token, !air.async.token, !air.async.token, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32>) -> !air.async.token
"func.return"() : () -> ()
}) : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "amdaie_xclbin_fb", sym_visibility = "public", target = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>} : () -> ()
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%9 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%9 = tensor.empty() : tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x512x64xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %10 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%11 = tensor.empty() : tensor<1x1x64x64xi32>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_3 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_4: i32, %out: i32):
%13 = arith.muli %in, %in_4 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%16 = arith.muli %in, %in_6 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %15 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%17 = arith.muli %in, %in_6 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x64x64xi32>
%16 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %pack_5) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_6 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_7 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_8 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%18 = arith.muli %in, %in_9 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %16 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %9 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%12 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %11) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%14 = arith.muli %in, %in_8 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
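// Annotation: compared with the previous dump, the accumulator is now
// zero-filled per 32x32 tile inside the inner scf.forall (%12) instead of
// threading the whole 64x64 fill through shared_outs. This leaves the outer
// fill (%10) and the duplicated extract_slice (%extracted_slice_8) dead;
// the cleanup pass below is expected to delete them.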
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%12 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%14 = arith.muli %in, %in_9 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
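// Annotation: output of the cleanup pass. The dead whole-tile fill and the
// duplicate extract_slice are gone; each 32x32 accumulator slice is now
// zero-filled (%11) immediately before the matmul-like linalg.generic
// consumes it.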
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
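// Annotation: the canonicalizer found nothing to fold at this point; the
// dump below repeats the previous one verbatim.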
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
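// Annotation: CSE likewise made no changes; the function body below is
// identical to the two preceding dumps (this printout simply omits the
// surrounding module wrapper).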
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
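// Annotation: output of the pack-and-transpose step. The second level of the
// packing_config (packedSizes = [0, 0, 0, 4, 4, 8]) has been applied: the
// 1x1x32x512 and 1x1x512x32 operand tiles are repacked into 6-D layouts
// (1x1x64x8x4x8 and 1x1x8x64x8x4, using outer_dims_perm = [0, 1, 3, 2]), the
// 1x1x32x32 accumulator into 1x1x8x8x4x4, and the matmul becomes a 9-D
// linalg.generic. Reading its indexing maps: d0/d1 are unit outer M/N dims,
// d2 a unit outer reduction, d3/d4 index the 8x8 grid of 4x4 output
// sub-tiles, d5 walks the 64 K-tiles, and d6/d7/d8 are the innermost
// 4 (M) x 4 (N) x 8 (K) element dims, so 64 * 8 recovers the full K = 512.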
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%pack_10 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_10 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%19 = arith.muli %in, %in_12 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_11 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
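// Annotation: output of bufferize-to-allocation. The packed 1x1x8x8x4x4
// accumulator tile now has its own memory-space-2 allocation (%alloc_10),
// accessed through bufferization.to_tensor, mirroring the space-1
// allocations that already back the 64x64-level operands. Presumably the
// two address spaces correspond to shared (memtile) and core-local memory
// on the AIE target, though the dump itself only shows the space indices.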
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%20 = arith.muli %in, %in_13 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_12 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
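// Annotation: output of tile-and-fuse. The third level of tile_sizes
// ([0, 0, 0, 0, 0, 4]) tiles the K-tile dimension d5 by 4, producing the
// scf.for below over 0..64 step 4: 16 iterations, each consuming a
// 1x1x4x8x4x8 slice of A and a 1x1x8x4x8x4 slice of B, i.e. 4 * 8 = 32 of
// the 512 reduction elements per iteration. The re-materialized constants,
// leftover tensor.empty ops, and the no-op extract/insert_slice pair around
// %arg7 are expected to be swept away by the cleanup pass below.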
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%21 = arith.muli %in, %in_14 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
%c0_12 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%20 = scf.for %arg6 = %c0_12 to %c64 step %c4 iter_args(%arg7 = %pack_11) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%extracted_slice_16 = tensor.extract_slice %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_14, %extracted_slice_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%22 = arith.muli %in, %in_17 : i32
%23 = arith.addi %out, %22 : i32
linalg.yield %23 : i32
} -> tensor<1x1x8x8x4x4xi32>
%inserted_slice = tensor.insert_slice %21 into %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
%unpack_13 = tensor.unpack %20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
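// Annotation: output of the cleanup pass. The dead tensor.empty ops and
// no-op slices are gone, and the zero-fill now initializes the packed
// accumulator (%15) directly, which the scf.for carries through iter_args.
// The unpack of the loop result still targets the 32x32 fill %11, which is
// now redundant since the unpack overwrites the whole destination.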
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%13 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%16 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%18 = arith.muli %in, %in_14 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
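// Annotation: the canonicalizer dropped that redundant 32x32 fill; the
// unpack of the loop result now writes straight into %extracted_slice_7.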
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFusePackIntoForLoop (iree-amdaie-fuse-pack-into-for) //----- //
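// Annotation: CSE made no further changes; this dump repeats the previous
// one at function scope. Judging by its name, the next pass likely fuses
// the operand tensor.pack ops into the reduction scf.for so that each
// iteration packs only the 4-K-tile slice it consumes, though that is an
// inference from the pass name rather than something shown in this dump.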
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
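// Fusion has sunk the packs into the scf.for body: %pack_14 and %pack_18 now pack
// one 1x1x32x32 tile per iteration directly into slices of %11 and %12, and the
// linalg.generic consumes those. The hoisted %pack_8/%pack_9 and the slices taken
// from them (%extracted_slice_15, %extracted_slice_19) are now dead; the cleanup
// pass removes them, as the next dump shows.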
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_13 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_13 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_16 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_17 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_18 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_19 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_14, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%19 = arith.muli %in, %in_20 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
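// After cleanup the hoisted packs and their dead slices are gone. One redundancy
// remains in the loop body: two identical affine.apply affine_map<(d0) -> (d0 * 8)>
// ops (%16 and %17). They survive this canonicalization run and are merged by the
// CSE pass that follows.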
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
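// Canonicalization left this function essentially unchanged; CSE now folds the
// duplicated affine.apply ops (%16 and %17 compute the same d0 * 8 index) into a
// single op reused by both operand slices.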
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
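// With a single affine.apply now feeding both 32x32 operand slices, this pass gives
// the per-iteration pack destinations explicit allocations: in the next dump the
// 1x1x4x8x4x8 and 1x1x8x4x8x4 pack results target memref.alloc ops in memory
// space 2 (presumably the core-local level, matching the existing space-2
// accumulator %alloc_8) instead of slices of tensor.empty.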
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- //
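// The operand packs now write into dedicated space-2 buffers (%alloc_12, %alloc_16)
// that are deallocated at the end of every iteration. The lower-to-ukernels pass
// would swap matching linalg ops for microkernel calls, but this variant was
// compiled with ukernels = "none", so the linalg.generic is still present in the
// following dump.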
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%alloc_12 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%17 = bufferization.to_tensor %alloc_12 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_15 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%alloc_16 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_16 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%20 = arith.muli %in, %in_18 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_16 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
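// The leftover tensor.empty ops for the packed layouts (%11, %12) and their unused
// slices have been cleaned up; every pack now lands in a bufferization.to_tensor of
// a real allocation. EliminateEmptyTensors targets the one remaining tensor.empty,
// the 2048x2048 result init (%5): in the next dump it is replaced by a
// flow.dispatch.tensor.load of the writeonly output binding, letting bufferization
// write results in place.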
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
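// %5 now aliases the output binding instead of a freshly created empty tensor.
// EmptyTensorToAllocTensor would rewrite any remaining tensor.empty into
// bufferization.alloc_tensor, but none are left in this function, so the IR below
// appears to carry over unchanged into comprehensive bufferization.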
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
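// Final tensor-level form. Comprehensive bufferization is expected to rewrite the
// remaining tensor ops (pack/unpack, extract_slice, fill, and the generic) onto the
// memrefs introduced by the earlier bufferize-to-allocation steps, removing the
// bufferization.to_tensor bridges between the allocations and the tensor SSA values.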
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
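        // Annotation: the 9-D linalg.generic below is the packed matmul micro-kernel.
        // d2, d5, d8 are the reduction dims and d0, d1, d3, d4, d6, d7 the parallel dims,
        // so over the 1x1x4x8x4x8 and 1x1x8x4x8x4 packed operands it performs a
        // 32x32x32 matmul-accumulate; the region is the scalar body out += in0 * in1.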
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
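// Annotation: bufferization is done. The dispatch bindings are now raw memrefs with
// memref.assume_alignment, tensor.extract_slice has become memref.subview, and
// tensor.pack/unpack became iree_linalg_ext.pack/unpack on buffers. The copy
// linalg.generics near the end of the function are conservative copies that
// bufferization inserted where it could not prove the destination was already up to
// date; later cleanup passes remove them. With every shape static,
// ResolveShapedTypeResultDims itself has nothing to resolve here.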
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
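// Annotation: same function as the previous dump, printed without the module wrapper.
// Comparing with the dump after this canonicalization run: the dead iter_args are
// dropped from the scf.for (the accumulator memref is only threaded through
// unchanged), the unpack then consumes %alloc_8 directly, and the whole-buffer
// identity copy of %2 at the end of the function is erased.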
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump Before CSE (cse) //----- //
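// Annotation: CSE now merges the duplicated memref.subview ops (%subview_7 vs.
// %subview_9 over %alloc_3, and %subview_1 vs. %subview_4 over %2), which turns the
// two remaining copy linalg.generics into self-copies in the next dump.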
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%4 = arith.muli %in, %in_14 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
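// Annotation: after CSE the leftover copies read and write the same subview (ins and
// outs alias), so this canonicalization round deletes them, leaving a pure
// pack -> micro-kernel -> unpack pipeline on buffers.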
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
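// Annotation: the self-copies are gone. CleanupBufferAllocView folds or removes
// redundant allocation/view pairs; the IR here is already clean, and the dump that
// follows it is unchanged, so the pass is a no-op at this point.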
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before LowerUKernelOpsToCalls (iree-codegen-lower-ukernel-ops-to-calls) //----- //
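// Annotation: the variant was configured with ukernels = "none" and no
// iree_codegen.ukernel ops exist in the IR, so lowering ukernel ops to calls is a
// no-op here; the next dump is identical.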
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
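// Annotation: FoldMemRefAliasOps composes chains of memref.subview into their users.
// Its effect shows in the next dump: inside the K loop the packs now take subviews of
// %alloc and %alloc_2 directly, with the loop offset folded into the subview indices
// instead of going through the intermediate %subview_4/%subview_5 views.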
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
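// Annotation: back at executable-variant scope; the export now carries
// translation_info = #iree_codegen.translation_info<None>. Alias folding left two
// identical affine.apply ops in the K loop (%3 and %4 both compute %arg4 * 8),
// duplicates that a later CSE run can fold.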
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
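// NOTE: the linalg.generic below is the packed matmul kernel. With
// iterator_types [P,P,R, P,P,R, P,P,R], (d0,d1,d2) are the outer M/N/K tile
// loops (all of extent 1 here), (d3,d4) walk the 8x8 grid of 4x4 output
// micro-tiles, d5 walks the 4 K-tiles, and (d6,d7,d8) form the innermost
// C(4x4) += A(4x8) * B(8x4) contraction. A rough sketch of the schedule for
// one 64x64 C tile, read off the surrounding loops (not part of the IR):
//   for (core_m, core_n) in 2x2:         // inner scf.forall, 32x32 sub-tile
//     for k in range(0, 64, 4):          // 16 slabs; offset = k * 8, K = 32
//       for (m8, n8, k4) in 8 x 8 x 4:   // packed tile grid in local memory
//         C[m8, n8] += A[m8, k4] @ B[k4, n8]   // 4x4 += 4x8 @ 8x4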
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before CSE (cse) //----- //
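// NOTE: CSE removes the duplicated `%4 = affine.apply ...` inside the K-loop;
// in the next dump, `%subview_8` indexes `%alloc_2` with `%3` directly.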
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before AMDAIELowerWorkgroupCount (iree-amdaie-lower-workgroup-count) //----- //
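// NOTE: judging by its name, this pass materializes the workgroup count for
// the export (`flow.dispatch.workgroup_count_from_slice`). Its effect is not
// directly visible below, because the subsequent dumps are printed at
// inner-module scope and the hal.executable wrapper no longer appears.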
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
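// NOTE: this pass erases the `#hal.descriptor_type<storage_buffer>` memory
// space from the binding memref types: compare
// `memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>` here with the
// plain `memref<2048x512xi32>` in the next dump.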
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
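// NOTE: fold-memref-alias-ops folds subview/expand/collapse view chains into
// their users where it can; here every subview feeds a pack/unpack or another
// subview, and the next dump appears unchanged.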
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEBridgeToAIR (iree-amdaie-bridge-to-air) //----- //
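// NOTE: the bridge to AIR rewrites both `scf.forall` loops as `scf.parallel`
// with explicit bounds, steps, and `scf.reduce` terminators (next dump), and
// the K-loop `affine.apply` switches to symbol form
// `affine_map<()[s0] -> (s0 * 8)>`; the pack/unpack ops are untouched at this
// point.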
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEDecomposeLinalgExtPackUnPackToAIR (iree-amdaie-decompose-pack-unpack-to-air) //----- //
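// NOTE: this pass decomposes `iree_linalg_ext.pack`/`unpack` into view ops
// plus DMA copies: in the next dump, each pack into a `2 : i32` buffer
// becomes `memref.expand_shape` + `memref.transpose` (relayout views, no data
// movement by themselves) feeding an `air.dma_memcpy_nd`, while the packs and
// unpacks against the `1 : i32` buffers become plain `air.dma_memcpy_nd` of
// the corresponding subviews.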
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToHerd (air-par-to-herd) //----- //
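// NOTE: air-par-to-herd maps the inner 2x2 `scf.parallel` onto
// `air.herd @herd_0 tile (..) in (2, 2)` (next dump); the former induction
// variables reappear as herd coordinates scaled by 32 via
// `affine_map<(d0) -> (d0 * 32)>`. Presumably each herd tile is one AIE
// compute core, with the `2 : i32` buffers living in core-local memory and
// the `1 : i32` buffers one level up.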
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_5 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_8 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
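// NOTE: the expand_shape/transpose pair below is the decomposed form of the
// earlier `iree_linalg_ext.pack ... inner_tiles = [4, 8]`: the 32x32 slab is
// reshaped into an 8x4 grid of 4x8 tiles, the tile grid is transposed to
// match `outer_dims_perm = [0, 1, 3, 2]`, and the DMA then gathers the strided
// view into the contiguous 1x1x4x8x4x8 local buffer.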
%expand_shape = memref.expand_shape %subview_8 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_10 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %transpose_10[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_11 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_13 = memref.expand_shape %subview_11 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %transpose_14[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_15: i32, %out: i32):
%4 = arith.muli %in, %in_15 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_7 = memref.transpose %alloc_6 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_5[] [] [], %transpose_7[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
%subview_4 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_4 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToLaunch (air-par-to-launch) //----- //
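// NOTE: air-par-to-launch maps the outer 32x32 `scf.parallel` onto
// `air.launch (..) in (32, 32)` wrapping an `air.segment` (next dump); the
// per-workgroup base offsets come back as `affine_map<(d0) -> (d0 * 64)>`
// applied to the launch induction variables (2048 / 64 = 32 tiles per
// dimension).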
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_4 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2_5 = arith.constant 2 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c2_8 = arith.constant 2 : index
%c2_9 = arith.constant 2 : index
air.herd @herd_0 tile (%arg2, %arg3) in (%arg4=%c2_8, %arg5=%c2_9) args(%arg6=%alloc_3, %arg7=%alloc, %arg8=%alloc_2) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_11 = arith.constant 0 : i32
%c0_12 = arith.constant 0 : index
%c64_13 = arith.constant 64 : index
%c4_14 = arith.constant 4 : index
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%subview_15 = memref.subview %arg6[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_16 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_11 : i32) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg9 = %c0_12 to %c64_13 step %c4_14 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
%subview_18 = memref.subview %arg7[0, 0, %3, %5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_19 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_18 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_20 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_19[] [] [], %transpose_20[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_21 = memref.subview %arg8[0, 0, %5, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_22 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_23 = memref.expand_shape %subview_21 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_22[] [] [], %transpose_24[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_19, %alloc_22 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_25: i32, %out: i32):
%6 = arith.muli %in, %in_25 : i32
%7 = arith.addi %out, %6 : i32
linalg.yield %7 : i32
}
memref.dealloc %alloc_19 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_22 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_17 = memref.transpose %alloc_16 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_15[] [] [], %transpose_17[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before CopyToDma (air-copy-to-dma) //----- //
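// NOTE: at this point the AIR hierarchy is complete: air.launch (32x32
// workgroups) > air.segment @segment_0 > air.herd @herd_0 (2x2 cores).
// air-copy-to-dma would convert any remaining memref copies to DMAs, but the
// transfers here are already `air.dma_memcpy_nd`. The unused index constants
// around the launch and herd (e.g. %c0_1, %c1, %c0_11) look like leftovers
// from the loop conversions, to be swept up by later canonicalization.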
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
%c32_0 = arith.constant 32 : index
%c0_1 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_2 = arith.constant 32 : index
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%c32_5 = arith.constant 32 : index
%c32_6 = arith.constant 32 : index
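// NOTE: the segment below receives the launch induction variables and the
// three binding buffers as explicit block arguments (%arg7..%arg13); judging
// by this structure, launch/segment/herd bodies are isolated from above, so
// every external value must be threaded through `args(...)`.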
air.launch (%arg0, %arg1) in (%arg2=%c32_5, %arg3=%c32_6) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg7)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg8)
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_7 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_8 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_9 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %subview_7[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_10 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_11 = arith.constant 0 : index
%c1_12 = arith.constant 1 : index
%c2_13 = arith.constant 2 : index
%c0_14 = arith.constant 0 : index
%c1_15 = arith.constant 1 : index
%c2_16 = arith.constant 2 : index
%c2_17 = arith.constant 2 : index
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2_16, %arg17=%c2_17) args(%arg18=%alloc_10, %arg19=%alloc, %arg20=%alloc_9) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_19 = arith.constant 0 : i32
%c0_20 = arith.constant 0 : index
%c64_21 = arith.constant 64 : index
%c4_22 = arith.constant 4 : index
%5 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg14)
%6 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg15)
%subview_23 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_24 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_19 : i32) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_20 to %c64_21 step %c4_22 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_26 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_27 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_26 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_28 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_27[] [] [], %transpose_28[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_29 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_30 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_31 = memref.expand_shape %subview_29 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_32 = memref.transpose %expand_shape_31 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_30[] [] [], %transpose_32[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_27, %alloc_30 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_33: i32, %out: i32):
%8 = arith.muli %in, %in_33 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_27 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_30 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_25 = memref.transpose %alloc_24 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_23[] [] [], %transpose_25[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_18 = memref.subview %alloc_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_18 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_8[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_10 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
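// After air-copy-to-dma (dump below): each air.dma_memcpy_nd now spells out its
// access pattern as explicit offset/size/stride operands, e.g. strides such as
// [32768, 32768, 8, 2048, 512, 1] for the packed 1x1x4x8x4x8 LHS tile, at the
// cost of many duplicated arith.constant ops that the canonicalizer is expected
// to fold.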
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%c0_2 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c512_3 = arith.constant 512 : index
air.dma_memcpy_nd (%alloc[] [] [], %arg11[%3, %c0_2] [%c64, %c512_3] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_4 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%c0_5 = arith.constant 0 : index
%c2048 = arith.constant 2048 : index
%c1_6 = arith.constant 1 : index
%c512_7 = arith.constant 512 : index
%c64_8 = arith.constant 64 : index
air.dma_memcpy_nd (%alloc_4[] [] [], %arg12[%c0_5, %4] [%c512_7, %c64_8] [%c2048, %c1_6]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_9 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2, %arg17=%c2) args(%arg18=%alloc_9, %arg19=%alloc, %arg20=%alloc_4) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg14]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg15]
%subview_28 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_29 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_26 to %c64_27 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_55 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_56 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_55 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_57 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
%c0_58 = arith.constant 0 : index
%c0_59 = arith.constant 0 : index
%c32768 = arith.constant 32768 : index
%c32768_60 = arith.constant 32768 : index
%c8_61 = arith.constant 8 : index
%c2048_62 = arith.constant 2048 : index
%c512_63 = arith.constant 512 : index
%c1_64 = arith.constant 1 : index
%c1_65 = arith.constant 1 : index
%c1_66 = arith.constant 1 : index
%c4_67 = arith.constant 4 : index
%c8_68 = arith.constant 8 : index
%c4_69 = arith.constant 4 : index
%c8_70 = arith.constant 8 : index
%c0_71 = arith.constant 0 : index
%c0_72 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_56[] [] [], %arg19[%c0_72, %c0_71, %c0_58, %c0_59, %5, %7] [%c1_65, %c1_66, %c4_67, %c8_68, %c4_69, %c8_70] [%c32768, %c32768_60, %c8_61, %c2048_62, %c512_63, %c1_64]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%subview_73 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_74 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_75 = memref.expand_shape %subview_73 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_76 = memref.transpose %expand_shape_75 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
%c0_77 = arith.constant 0 : index
%c0_78 = arith.constant 0 : index
%c32768_79 = arith.constant 32768 : index
%c32768_80 = arith.constant 32768 : index
%c4_81 = arith.constant 4 : index
%c512_82 = arith.constant 512 : index
%c64_83 = arith.constant 64 : index
%c1_84 = arith.constant 1 : index
%c1_85 = arith.constant 1 : index
%c1_86 = arith.constant 1 : index
%c8_87 = arith.constant 8 : index
%c4_88 = arith.constant 4 : index
%c8_89 = arith.constant 8 : index
%c4_90 = arith.constant 4 : index
%c0_91 = arith.constant 0 : index
%c0_92 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_74[] [] [], %arg20[%c0_92, %c0_91, %c0_77, %c0_78, %7, %6] [%c1_85, %c1_86, %c8_87, %c4_88, %c8_89, %c4_90] [%c32768_79, %c32768_80, %c4_81, %c512_82, %c64_83, %c1_84]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_56, %alloc_74 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_93: i32, %out: i32):
%8 = arith.muli %in, %in_93 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_56 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_74 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_30 = memref.transpose %alloc_29 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
%c0_31 = arith.constant 0 : index
%c0_32 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1024_33 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%c4_34 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c1_35 = arith.constant 1 : index
%c1_36 = arith.constant 1 : index
%c1_37 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c4_40 = arith.constant 4 : index
%c4096_41 = arith.constant 4096 : index
%c4096_42 = arith.constant 4096 : index
%c64_43 = arith.constant 64 : index
%c1_44 = arith.constant 1 : index
%c1_45 = arith.constant 1 : index
%c1_46 = arith.constant 1 : index
%c32_47 = arith.constant 32 : index
%c32_48 = arith.constant 32 : index
%c0_49 = arith.constant 0 : index
%c0_50 = arith.constant 0 : index
%c0_51 = arith.constant 0 : index
%c0_52 = arith.constant 0 : index
%c0_53 = arith.constant 0 : index
%c0_54 = arith.constant 0 : index
air.dma_memcpy_nd (%arg18[%c0_31, %c0_32, %5, %6] [%c1_45, %c1_46, %c32_47, %c32_48] [%c4096_41, %c4096_42, %c64_43, %c1_44], %alloc_29[%c0_54, %c0_53, %c0_52, %c0_51, %c0_50, %c0_49] [%c1_36, %c1_37, %c8, %c4_38, %c8_39, %c4_40] [%c1024, %c1024_33, %c16, %c4_34, %c128, %c1_35]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_9[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
%c0_11 = arith.constant 0 : index
%c0_12 = arith.constant 0 : index
%c0_13 = arith.constant 0 : index
%c0_14 = arith.constant 0 : index
%c64_15 = arith.constant 64 : index
%c1_16 = arith.constant 1 : index
%c64_17 = arith.constant 64 : index
%c64_18 = arith.constant 64 : index
%c2048_19 = arith.constant 2048 : index
%c1_20 = arith.constant 1 : index
%c64_21 = arith.constant 64 : index
%c64_22 = arith.constant 64 : index
%c1_23 = arith.constant 1 : index
%c1_24 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%c4096_25 = arith.constant 4096 : index
air.dma_memcpy_nd (%arg13[%3, %4] [%c64_21, %c64_22] [%c2048_19, %c1_20], %alloc_9[%c0_11, %c0_12, %c0_13, %c0_14] [%c1_24, %c1_23, %c64_17, %c64_18] [%c4096_25, %c4096, %c64_15, %c1_16]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
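// After canonicalization (dump below): duplicate arith.constant ops are folded
// to one per value per region, the dead memref.subview / expand_shape /
// transpose chains are gone, and the unused grid-size segment arguments have
// been dropped from the air.segment signature. Comparing against the next dump,
// the CSE run that follows appears to make no further changes here.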
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependency (air-dependency) //----- //
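// Input to air-dependency (dump below, unchanged by CSE). Judging by the next
// dump, this pass rewrites the IR into async form: side-effecting ops are
// wrapped in air.execute regions yielding !air.async.token values, the DMA,
// herd, launch and segment ops gain `async [...]` dependency lists, and the
// scf.for loop threads a token through iter_args (with air.wait_all joining
// tokens) so that per-iteration copies and compute stay ordered.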
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependencyScheduleOpt (air-dependency-schedule-opt) //----- //
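// Async IR after air-dependency (dump below): every alloc, affine.apply, fill
// and dealloc sits in an air.execute region tagged with an integer id, and the
// loop-carried token orders each K-step's two input DMAs before the
// linalg.generic. The schedule-opt pass that follows appears to analyze these
// DMAs for reuse: in the next dump the two L2-to-L1 input copies (ids 3 and 4)
// carry broadcast_pattern affine_set attributes.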
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%2, %3, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%c0_36 = arith.constant 0 : index
%c1_37 = arith.constant 1 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c32768_40 = arith.constant 32768 : index
%c2048_41 = arith.constant 2048 : index
%c512_42 = arith.constant 512 : index
%c64_43 = arith.constant 64 : index
%async_token_44, %results_45 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_46, %results_47 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_46, %async_token_44, %arg20] (%results_47[] [] [], %arg17[%c0_36, %c0_36, %c0_36, %c0_36, %results_29, %results_45] [%c1_37, %c1_37, %c4_38, %c8_39, %c4_38, %c8_39] [%c32768_40, %c32768_40, %c8_39, %c2048_41, %c512_42, %c1_37]) {id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_48, %async_token_44, %arg20] (%results_49[] [] [], %arg18[%c0_36, %c0_36, %c0_36, %c0_36, %results_45, %results_31] [%c1_37, %c1_37, %c8_39, %c4_38, %c8_39, %c4_38] [%c32768_40, %c32768_40, %c4_38, %c512_42, %c64_43, %c1_37]) {id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_50 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_47, %results_49 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_53: i32, %out: i32):
%12 = arith.muli %in, %in_53 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_51 = air.execute [%async_token_50] {
memref.dealloc %results_47 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_52 = air.execute [%async_token_50] {
memref.dealloc %results_49 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_50] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRSpecializeDmaBroadcast (air-specialize-dma-broadcast) //----- //
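// After air-dependency-schedule-opt (dump below): the two L2-to-L1 input DMAs
// now carry broadcast_pattern affine sets over the herd coordinates. For the
// LHS tile the set pins d0 (the herd row) and leaves d1 free, i.e. one copy can
// feed both herd columns; for the RHS tile it is the transpose, with one copy
// per column feeding both rows. The specialization pass below splits each
// tagged DMA into affine.if branches with concrete offsets and broadcast_set
// attributes.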
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%2, %3, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_26, %c0_26, %c0_26, %c0_26, %results_29, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 - s0 == 0, d1 >= 0, -d1 + 1 >= 0, s0 >= 0, -s0 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_26, %c0_26, %c0_26, %c0_26, %results_37, %results_31] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 1 >= 0, d1 - s0 == 0, s0 >= 0, -s0 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before DmaToChannel (air-dma-to-channel) //----- //
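// After air-specialize-dma-broadcast (dump below): each broadcast DMA became an
// affine.if over [%arg12, %arg13]. The LHS copy (ids 3/4) branches on the herd
// row, reading the L2 tile at row offset 0 or 32 and sharing it across both
// columns; the RHS copy (ids 5/6) branches on the herd column, reading at
// column offset 0 or 32 and sharing it across both rows. air-dma-to-channel
// below is expected to lower every air.dma_memcpy_nd to air.channel put/get
// pairs.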
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%2, %3, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c0_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c32_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c0_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>, id = 5 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c32_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 - 1 == 0)>, id = 6 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 7 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 8 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
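// Note: the 9-D linalg.generic in the herd body above is the packed matmul
// micro-kernel. Reading off its indexing maps, it computes (scalar sketch):
//   C[d0,d1,d4,d3,d6,d7] += A[d0,d2,d5,d3,d6,d8] * B[d2,d1,d4,d5,d8,d7]
// with d2, d5 and d8 as the reduction dimensions, accumulating into the
// 1x1x8x8x4x4 output tile in local (level-2) memory.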
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%c2048 = arith.constant 2048 : index
%c64_10 = arith.constant 64 : index
%c1_11 = arith.constant 1 : index
%c512_12 = arith.constant 512 : index
%c0_13 = arith.constant 0 : index
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_14] @channel_5[] (%arg5[%c0_13, %results_15] [%c512_12, %c64_10] [%c2048, %c1_11]) : (memref<512x2048xi32>)
%c2048_16 = arith.constant 2048 : index
%c64_17 = arith.constant 64 : index
%c1_18 = arith.constant 1 : index
%async_token_19, %results_20 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_21, %results_22 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_19, %async_token_21] @channel_7[] (%arg6[%results_20, %results_22] [%c64_17, %c64_17] [%c2048_16, %c1_18]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048_23 = arith.constant 2048 : index
%c64_24 = arith.constant 64 : index
%c1_25 = arith.constant 1 : index
%c512_26 = arith.constant 512 : index
%c0_27 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_28, %results_29 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %23 : index
} {id = 7 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %23 : index
} {id = 8 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_28, %async_token_32] @channel_4[] (%results_33[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_34, %results_35 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_30, %async_token_34] @channel_5[] (%results_35[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_36, %results_37 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%c0_38 = arith.constant 0 : index
%c1_39 = arith.constant 1 : index
%c512_40 = arith.constant 512 : index
%c2048_41 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_42 = arith.constant 0 : index
%c64_43 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_44, %results_45 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_46, %results_47 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async
%8 = air.wait_all async [%async_token_32, %5, %async_token_44, %async_token_46, %7] {id = 2 : i32}
%9 = scf.for %arg12 = %c0_42 to %c64_43 step %c4 iter_args(%arg13 = %8) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_0[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c0_108, %results_107] [%c1_39, %c1_39, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_41, %c512_40, %c1_39]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_50 = arith.constant 1 : index
%c512_51 = arith.constant 512 : index
%c2048_52 = arith.constant 2048 : index
%c8_53 = arith.constant 8 : index
%c32768_54 = arith.constant 32768 : index
%c0_55 = arith.constant 0 : index
%c64_56 = arith.constant 64 : index
%c4_57 = arith.constant 4 : index
%async_token_58, %results_59 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_60, %results_61 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_62, %results_63 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = air.wait_all async
%11 = air.wait_all async [%async_token_32, %5, %async_token_58, %async_token_60, %10] {id = 2 : i32}
%12 = scf.for %arg12 = %c0_55 to %c64_56 step %c4_57 iter_args(%arg13 = %11) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_108 = arith.constant 32 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_1[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c32_108, %results_107] [%c1_50, %c1_50, %c4_57, %c8_53, %c4_57, %c8_53] [%c32768_54, %c32768_54, %c8_53, %c2048_52, %c512_51, %c1_50]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c0_64 = arith.constant 0 : index
%c1_65 = arith.constant 1 : index
%c512_66 = arith.constant 512 : index
%c8_67 = arith.constant 8 : index
%c32768_68 = arith.constant 32768 : index
%c0_69 = arith.constant 0 : index
%c64_70 = arith.constant 64 : index
%c4_71 = arith.constant 4 : index
%async_token_72, %results_73 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_74, %results_75 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_76, %results_77 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async
%14 = air.wait_all async [%async_token_34, %6, %async_token_72, %async_token_74, %13] {id = 2 : i32}
%15 = scf.for %arg12 = %c0_69 to %c64_70 step %c4_71 iter_args(%arg13 = %14) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_107 = arith.constant 0 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_2[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c0_107] [%c1_65, %c1_65, %c8_67, %c4_71, %c8_67, %c4_71] [%c32768_68, %c32768_68, %c4_71, %c512_66, %c64_70, %c1_65]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_78 = arith.constant 1 : index
%c512_79 = arith.constant 512 : index
%c8_80 = arith.constant 8 : index
%c32768_81 = arith.constant 32768 : index
%c0_82 = arith.constant 0 : index
%c64_83 = arith.constant 64 : index
%c4_84 = arith.constant 4 : index
%async_token_85, %results_86 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_87, %results_88 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_89, %results_90 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%16 = air.wait_all async
%17 = air.wait_all async [%async_token_34, %6, %async_token_85, %async_token_87, %16] {id = 2 : i32}
%18 = scf.for %arg12 = %c0_82 to %c64_83 step %c4_84 iter_args(%arg13 = %17) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_107 = arith.constant 32 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_3[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c32_107] [%c1_78, %c1_78, %c8_80, %c4_84, %c8_80, %c4_84] [%c32768_81, %c32768_81, %c4_84, %c512_79, %c64_83, %c1_78]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_91 = arith.constant 1 : index
%c0_92 = arith.constant 0 : index
%c0_93 = arith.constant 0 : index
%c2_94 = arith.constant 2 : index
%c2_95 = arith.constant 2 : index
%19 = air.wait_all async [%async_token_36]
%20 = scf.parallel (%arg12, %arg13) = (%c0_92, %c0_93) to (%c2_94, %c2_95) step (%c1_91, %c1_91) init (%19) -> !air.async.token {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c1_101 = arith.constant 1 : index
%c0_102 = arith.constant 0 : index
%c64_103 = arith.constant 64 : index
%async_token_104, %results_105 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %25 : index
} {id = 12 : i32}
%async_token_106, %results_107 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %25 : index
} {id = 13 : i32}
%23 = air.channel.get async [%async_token_106, %async_token_104, %async_token_36, %19] @channel_6[%arg12, %arg13] (%results_37[%c0_102, %c0_102, %results_105, %results_107] [%c1_101, %c1_101, %c32_99, %c32_99] [%c4096_100, %c4096_100, %c64_103, %c1_101]) : (memref<1x1x64x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.reduce(%24 : !air.async.token) {
^bb0(%arg14: !air.async.token, %arg15: !air.async.token):
%25 = air.wait_all async [%arg14, %arg15]
scf.reduce.return %25 : !air.async.token
}
}
%21 = air.herd @herd_0 async [%5, %6, %async_token_36] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_37, %arg17=%results_33, %arg18=%results_35) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_101 = arith.constant 1 : index
%c512_102 = arith.constant 512 : index
%c2048_103 = arith.constant 2048 : index
%c8_104 = arith.constant 8 : index
%c32768_105 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_106 = arith.constant 0 : index
%c64_107 = arith.constant 64 : index
%c4_108 = arith.constant 4 : index
%async_token_109, %results_110 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %26 : index
} {id = 12 : i32}
%async_token_111, %results_112 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %26 : index
} {id = 13 : i32}
%async_token_113, %results_114 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_115 = air.execute [%async_token_113] {
linalg.fill ins(%c0_i32 : i32) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%23 = air.wait_all async [%async_token_109, %async_token_111, %async_token_115] {id = 2 : i32}
%24 = scf.for %arg19 = %c0_106 to %c64_107 step %c4_108 iter_args(%arg20 = %23) -> (!air.async.token) {
%async_token_117, %results_118 = air.execute [%arg20] -> (index) {
%29 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %29 : index
} {id = 16 : i32}
%async_token_119, %results_120 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%26 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_0[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_1[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_121, %results_122 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%27 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_2[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_3[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_123 = air.execute [%27, %26, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_120, %results_122 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_126: i32, %out: i32):
%29 = arith.muli %in, %in_126 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
}
} {id = 19 : i32}
%async_token_124 = air.execute [%async_token_123] {
memref.dealloc %results_120 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_125 = air.execute [%async_token_123] {
memref.dealloc %results_122 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%28 = air.wait_all async [%arg20, %async_token_123] {id = 1 : i32}
scf.yield %28 : !air.async.token
}
%25 = air.channel.put async [%24] @channel_6[%arg12, %arg13] (%results_114[%c0_106, %c0_106, %c0_106, %c0_106, %c0_106, %c0_106] [%c1_101, %c1_101, %c8_104, %c4_108, %c8_104, %c4_108] [%c1024, %c1024, %c16, %c4_108, %c128, %c1_101]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_116 = air.execute [%25] {
memref.dealloc %results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%22 = air.channel.put async [%21] @channel_7[] (%results_37[%c0_27, %c0_27, %c0_27, %c0_27] [%c1_25, %c1_25, %c64_24, %c64_24] [%c4096, %c4096, %c64_24, %c1_25]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_96 = air.execute [%21] {
memref.dealloc %results_33 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_97 = air.execute [%21] {
memref.dealloc %results_35 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_98 = air.execute [%22] {
memref.dealloc %results_37 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
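// Note: comparing this dump with the previous one, the canonicalizer appears
// to have folded the dead shadowed constants inside the affine.if regions
// (the unused %c0_* / %c32_* copies), dropped the unused memref operands from
// air.segment @segment_0, and erased the no-operand air.wait_all tokens that
// only forwarded other tokens.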
// -----// IR Dump Before CSE (cse) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %18 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %18 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async [%async_token_26, %5]
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %7) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = air.wait_all async [%async_token_26, %5]
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %9) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%11 = air.wait_all async [%async_token_28, %6]
%12 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %11) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async [%async_token_28, %6]
%14 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %13) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%15 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %19 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %19 : index
} {id = 13 : i32}
%18 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%18 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%19 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %19 : !air.async.token
}
}
%16 = air.herd @herd_0 async [%5, %6, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %21 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %21 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%18 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%19 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %18) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %24 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%21 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%22 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_62 = air.execute [%22, %21, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%24 = arith.muli %in, %in_65 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%23 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %23 : !air.async.token
}
%20 = air.channel.put async [%19] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%20] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%17 = air.channel.put async [%16] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%16] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%16] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%17] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
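// Note: this dump appears identical to the previous one; after
// canonicalization no textual common subexpressions remain for CSE to merge.
// The duplicated index computations (e.g. %results_9 vs %results_13, both
// applying s0 * 64 to %arg0) survive, presumably because each air.execute
// carries its own async token and is not treated as pure.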
// -----// IR Dump Before AIRDependencyCanonicalize (air-dependency-canonicalize) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %18 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %18 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async [%async_token_26, %5]
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %7) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = air.wait_all async [%async_token_26, %5]
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %9) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%11 = air.wait_all async [%async_token_28, %6]
%12 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %11) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async [%async_token_28, %6]
%14 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %13) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%15 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %19 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %19 : index
} {id = 13 : i32}
%18 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%18 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%19 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %19 : !air.async.token
}
}
%16 = air.herd @herd_0 async [%5, %6, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %21 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %21 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%18 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%19 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %18) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %24 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%21 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%22 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_62 = air.execute [%22, %21, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%24 = arith.muli %in, %in_65 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%23 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %23 : !air.async.token
}
%20 = air.channel.put async [%19] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%20] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%17 = air.channel.put async [%16] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%16] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%16] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%17] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
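// Note: in the dump below (after air-dependency-canonicalize) the
// {id = ... : i32} bookkeeping attributes have been stripped from the
// air.execute ops; since the pass canonicalizes the async dependency graph,
// redundant dependency edges (e.g. the repeated %async_token_30 operand on
// the air.channel.get above) would be expected to be pruned as well.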
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>