yzhang93/pad-pack pipeline IR Secret

## pad-pack pipeline IR
// Attribute aliases referenced by the module that follows.
// Executable target: backend "amd-aie", format "amdaie-xclbin-fb".
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>
// Scalar index maps: scale an index by 64, 32 and 8 respectively.
#map = affine_map<(d0) -> (d0 * 64)>
#map1 = affine_map<(d0) -> (d0 * 32)>
#map2 = affine_map<(d0) -> (d0 * 8)>
// Indexing maps of the six-dimensional linalg.generic:
// #map3 = first input, #map4 = second input, #map5 = output.
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
// Pipeline layout: one descriptor set (0) with three storage buffers;
// bindings 0 and 1 are ReadOnly, binding 2 carries no ReadOnly flag.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
// Translation info: TransformDialectCodegen with spec @__transform_main.
#translation = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @__transform_main>
// Device target listing the executable target above.
#device_target_amd_aie = #hal.device.target<"amd-aie", {executable_targets = [#executable_target_amdaie_xclbin_fb], legacy_sync}>
// Module with a single HAL executable. Its function multiplies the i32
// buffers at bindings 0 and 1 and writes the result to binding 2 (all three
// are 2048x2048), working on 64x64 output tiles that are further split into
// 32x32 sub-tiles computed on packed operands.
module attributes {hal.device.targets = [#device_target_amd_aie]} {
  hal.executable private @matmul_example_dispatch_0 {
    hal.executable.variant public @amdaie_xclbin_fb target(#executable_target_amdaie_xclbin_fb) {
      // Export region returns (32, 32, 1); the first two values match the
      // bounds of the outer scf.forall below.
      // NOTE(review): presumably the workgroup count in x/y/z -- confirm.
      hal.executable.export public @matmul_example_dispatch_0_matmul_2048x2048x2048_i32 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
      ^bb0(%arg0: !hal.device):
        %c32 = arith.constant 32 : index
        %c1 = arith.constant 1 : index
        hal.return %c32, %c32, %c1 : index, index, index
      }
      builtin.module {
        func.func @matmul_example_dispatch_0_matmul_2048x2048x2048_i32() {
          %c4 = arith.constant 4 : index
          %c256 = arith.constant 256 : index
          %c2048 = arith.constant 2048 : index
          %c0_i32 = arith.constant 0 : i32
          %c0 = arith.constant 0 : index
          // %0 and %1 are the two ReadOnly inputs (bindings 0 and 1);
          // %2 is the buffer written at the end of each outer iteration.
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
          memref.assume_alignment %0, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
          memref.assume_alignment %1, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
          %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
          memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
          // Outer tiling: 32x32 iterations, each owning the 64x64 tile of %2
          // at offset (%arg0 * 64, %arg1 * 64).
          scf.forall (%arg0, %arg1) in (32, 32) {
            %3 = affine.apply #map(%arg0)
            %4 = affine.apply #map(%arg1)
            %subview = memref.subview %2[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
            // Stage 64 full rows of %0 (starting at row %arg0 * 64) into a
            // 64x2048 buffer in memory space 1, 256 columns per iteration.
            %alloc = memref.alloc() : memref<64x2048xi32, 1>
            scf.for %arg2 = %c0 to %c2048 step %c256 {
              // Same value as %3; recomputed inside the loop.
              %5 = affine.apply #map(%arg0)
              %subview_2 = memref.subview %0[%5, %arg2] [64, 256] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_3 = memref.subview %alloc[0, %arg2] [64, 256] [1, 1] : memref<64x2048xi32, 1> to memref<64x256xi32, strided<[2048, 1], offset: ?>, 1>
              linalg.copy ins(%subview_2 : memref<64x256xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<64x256xi32, strided<[2048, 1], offset: ?>, 1>)
            }
            // Stage 64 full columns of %1 (starting at column %arg1 * 64) into
            // a 2048x64 buffer in memory space 1, 256 rows per iteration.
            %alloc_0 = memref.alloc() : memref<2048x64xi32, 1>
            scf.for %arg2 = %c0 to %c2048 step %c256 {
              // Same value as %4; recomputed inside the loop.
              %5 = affine.apply #map(%arg1)
              %subview_2 = memref.subview %1[%arg2, %5] [256, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
              %subview_3 = memref.subview %alloc_0[%arg2, 0] [256, 64] [1, 1] : memref<2048x64xi32, 1> to memref<256x64xi32, strided<[64, 1], offset: ?>, 1>
              linalg.copy ins(%subview_2 : memref<256x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<256x64xi32, strided<[64, 1], offset: ?>, 1>)
            }
            // 64x64 staging buffer (memory space 1) for this tile's result.
            %alloc_1 = memref.alloc() : memref<64x64xi32, 1>
            // Inner tiling: 2x2 iterations, each producing the 32x32 sub-tile
            // of %alloc_1 at offset (%arg2 * 32, %arg3 * 32).
            scf.forall (%arg2, %arg3) in (2, 2) {
              %5 = affine.apply #map1(%arg2)
              %6 = affine.apply #map1(%arg3)
              %subview_2 = memref.subview %alloc_1[%5, %6] [32, 32] [1, 1] : memref<64x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
              // Packed accumulator in memory space 2, zero-initialised.
              %alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2>
              linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<4x8x4x8xi32, 2>)
              // Reduction loop: %arg4 = 0, 4, ..., 252 and #map2 scales it by
              // 8, so the 32-wide slice offset advances 0, 32, ..., 2016
              // (64 iterations covering all 2048 reduction elements).
              scf.for %arg4 = %c0 to %c256 step %c4 {
                %7 = affine.apply #map1(%arg2)
                %8 = affine.apply #map2(%arg4)
                // 32x32 slice of the staged first operand, packed with 4x8
                // inner tiles into a 4x8x4x8 buffer in memory space 2.
                %subview_4 = memref.subview %alloc[%7, %8] [32, 32] [1, 1] : memref<64x2048xi32, 1> to memref<32x32xi32, strided<[2048, 1], offset: ?>, 1>
                %alloc_5 = memref.alloc() : memref<4x8x4x8xi32, 2>
                iree_linalg_ext.pack %subview_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_5 : (memref<32x32xi32, strided<[2048, 1], offset: ?>, 1> memref<4x8x4x8xi32, 2>)
                %9 = affine.apply #map2(%arg4)
                %10 = affine.apply #map1(%arg3)
                // 32x32 slice of the staged second operand, packed with 8x8
                // inner tiles into a 4x4x8x8 buffer in memory space 2.
                %subview_6 = memref.subview %alloc_0[%9, %10] [32, 32] [1, 1] : memref<2048x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
                %alloc_7 = memref.alloc() : memref<4x4x8x8xi32, 2>
                iree_linalg_ext.pack %subview_6 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc_7 : (memref<32x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x4x8x8xi32, 2>)
                // Multiply-accumulate on the packed operands into %alloc_3;
                // d2 and d5 are the reduction dimensions.
                linalg.generic {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_5, %alloc_7 : memref<4x8x4x8xi32, 2>, memref<4x4x8x8xi32, 2>) outs(%alloc_3 : memref<4x8x4x8xi32, 2>) {
                ^bb0(%in: i32, %in_8: i32, %out: i32):
                  %11 = arith.muli %in, %in_8 : i32
                  %12 = arith.addi %out, %11 : i32
                  linalg.yield %12 : i32
                }
                // The packed operand buffers are allocated and freed on every
                // reduction iteration.
                memref.dealloc %alloc_5 : memref<4x8x4x8xi32, 2>
                memref.dealloc %alloc_7 : memref<4x4x8x8xi32, 2>
              }
              // Unpack the accumulator into the 32x32 sub-tile of %alloc_1.
              iree_linalg_ext.unpack %alloc_3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_2 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
              memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2>
            } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
            // Write the finished 64x64 tile to %2, then free the staging
            // buffers.
            linalg.copy ins(%alloc_1 : memref<64x64xi32, 1>) outs(%subview : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
            memref.dealloc %alloc : memref<64x2048xi32, 1>
            memref.dealloc %alloc_0 : memref<2048x64xi32, 1>
            memref.dealloc %alloc_1 : memref<64x64xi32, 1>
          } {mapping = [#gpu.block<y>, #gpu.block<x>]}
          return
        }
      }
    }
  }
}
	// Attribute aliases referenced by the module that follows.
	// Executable target: backend "amd-aie", format "amdaie-xclbin-fb".
	#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>
	// Scalar index maps: scale an index by 64, 32 and 8 respectively.
	#map = affine_map<(d0) -> (d0 * 64)>
	#map1 = affine_map<(d0) -> (d0 * 32)>
	#map2 = affine_map<(d0) -> (d0 * 8)>
	// Indexing maps of the six-dimensional linalg.generic:
	// #map3 = first input, #map4 = second input, #map5 = output.
	#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
	#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
	#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
	// Pipeline layout: one descriptor set (0) with three storage buffers;
	// bindings 0 and 1 are ReadOnly, binding 2 carries no ReadOnly flag.
	#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
	// Translation info: TransformDialectCodegen with spec @__transform_main.
	#translation = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @__transform_main>
	// Device target listing the executable target above.
	#device_target_amd_aie = #hal.device.target<"amd-aie", {executable_targets = [#executable_target_amdaie_xclbin_fb], legacy_sync}>
	// Module with a single HAL executable. Its function multiplies the i32
	// buffers at bindings 0 and 1 and writes the result to binding 2 (all three
	// are 2048x2048), working on 64x64 output tiles that are further split into
	// 32x32 sub-tiles computed on packed operands.
	module attributes {hal.device.targets = [#device_target_amd_aie]} {
	hal.executable private @matmul_example_dispatch_0 {
	hal.executable.variant public @amdaie_xclbin_fb target(#executable_target_amdaie_xclbin_fb) {
	// Export region returns (32, 32, 1); the first two values match the
	// bounds of the outer scf.forall below.
	// NOTE(review): presumably the workgroup count in x/y/z -- confirm.
	hal.executable.export public @matmul_example_dispatch_0_matmul_2048x2048x2048_i32 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
	^bb0(%arg0: !hal.device):
	%c32 = arith.constant 32 : index
	%c1 = arith.constant 1 : index
	hal.return %c32, %c32, %c1 : index, index, index
	}
	builtin.module {
	func.func @matmul_example_dispatch_0_matmul_2048x2048x2048_i32() {
	%c4 = arith.constant 4 : index
	%c256 = arith.constant 256 : index
	%c2048 = arith.constant 2048 : index
	%c0_i32 = arith.constant 0 : i32
	%c0 = arith.constant 0 : index
	// %0 and %1 are the two ReadOnly inputs (bindings 0 and 1);
	// %2 is the buffer written at the end of each outer iteration.
	%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
	memref.assume_alignment %0, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
	%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
	memref.assume_alignment %1, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
	%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
	memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
	// Outer tiling: 32x32 iterations, each owning the 64x64 tile of %2
	// at offset (%arg0 * 64, %arg1 * 64).
	scf.forall (%arg0, %arg1) in (32, 32) {
	%3 = affine.apply #map(%arg0)
	%4 = affine.apply #map(%arg1)
	%subview = memref.subview %2[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
	// Stage 64 full rows of %0 (starting at row %arg0 * 64) into a
	// 64x2048 buffer in memory space 1, 256 columns per iteration.
	%alloc = memref.alloc() : memref<64x2048xi32, 1>
	scf.for %arg2 = %c0 to %c2048 step %c256 {
	// Same value as %3; recomputed inside the loop.
	%5 = affine.apply #map(%arg0)
	%subview_2 = memref.subview %0[%5, %arg2] [64, 256] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
	%subview_3 = memref.subview %alloc[0, %arg2] [64, 256] [1, 1] : memref<64x2048xi32, 1> to memref<64x256xi32, strided<[2048, 1], offset: ?>, 1>
	linalg.copy ins(%subview_2 : memref<64x256xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<64x256xi32, strided<[2048, 1], offset: ?>, 1>)
	}
	// Stage 64 full columns of %1 (starting at column %arg1 * 64) into
	// a 2048x64 buffer in memory space 1, 256 rows per iteration.
	%alloc_0 = memref.alloc() : memref<2048x64xi32, 1>
	scf.for %arg2 = %c0 to %c2048 step %c256 {
	// Same value as %4; recomputed inside the loop.
	%5 = affine.apply #map(%arg1)
	%subview_2 = memref.subview %1[%arg2, %5] [256, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
	%subview_3 = memref.subview %alloc_0[%arg2, 0] [256, 64] [1, 1] : memref<2048x64xi32, 1> to memref<256x64xi32, strided<[64, 1], offset: ?>, 1>
	linalg.copy ins(%subview_2 : memref<256x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<256x64xi32, strided<[64, 1], offset: ?>, 1>)
	}
	// 64x64 staging buffer (memory space 1) for this tile's result.
	%alloc_1 = memref.alloc() : memref<64x64xi32, 1>
	// Inner tiling: 2x2 iterations, each producing the 32x32 sub-tile
	// of %alloc_1 at offset (%arg2 * 32, %arg3 * 32).
	scf.forall (%arg2, %arg3) in (2, 2) {
	%5 = affine.apply #map1(%arg2)
	%6 = affine.apply #map1(%arg3)
	%subview_2 = memref.subview %alloc_1[%5, %6] [32, 32] [1, 1] : memref<64x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
	// Packed accumulator in memory space 2, zero-initialised.
	%alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2>
	linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<4x8x4x8xi32, 2>)
	// Reduction loop: %arg4 = 0, 4, ..., 252 and #map2 scales it by
	// 8, so the 32-wide slice offset advances 0, 32, ..., 2016
	// (64 iterations covering all 2048 reduction elements).
	scf.for %arg4 = %c0 to %c256 step %c4 {
	%7 = affine.apply #map1(%arg2)
	%8 = affine.apply #map2(%arg4)
	// 32x32 slice of the staged first operand, packed with 4x8
	// inner tiles into a 4x8x4x8 buffer in memory space 2.
	%subview_4 = memref.subview %alloc[%7, %8] [32, 32] [1, 1] : memref<64x2048xi32, 1> to memref<32x32xi32, strided<[2048, 1], offset: ?>, 1>
	%alloc_5 = memref.alloc() : memref<4x8x4x8xi32, 2>
	iree_linalg_ext.pack %subview_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_5 : (memref<32x32xi32, strided<[2048, 1], offset: ?>, 1> memref<4x8x4x8xi32, 2>)
	%9 = affine.apply #map2(%arg4)
	%10 = affine.apply #map1(%arg3)
	// 32x32 slice of the staged second operand, packed with 8x8
	// inner tiles into a 4x4x8x8 buffer in memory space 2.
	%subview_6 = memref.subview %alloc_0[%9, %10] [32, 32] [1, 1] : memref<2048x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
	%alloc_7 = memref.alloc() : memref<4x4x8x8xi32, 2>
	iree_linalg_ext.pack %subview_6 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc_7 : (memref<32x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x4x8x8xi32, 2>)
	// Multiply-accumulate on the packed operands into %alloc_3;
	// d2 and d5 are the reduction dimensions.
	linalg.generic {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_5, %alloc_7 : memref<4x8x4x8xi32, 2>, memref<4x4x8x8xi32, 2>) outs(%alloc_3 : memref<4x8x4x8xi32, 2>) {
	^bb0(%in: i32, %in_8: i32, %out: i32):
	%11 = arith.muli %in, %in_8 : i32
	%12 = arith.addi %out, %11 : i32
	linalg.yield %12 : i32
	}
	// The packed operand buffers are allocated and freed on every
	// reduction iteration.
	memref.dealloc %alloc_5 : memref<4x8x4x8xi32, 2>
	memref.dealloc %alloc_7 : memref<4x4x8x8xi32, 2>
	}
	// Unpack the accumulator into the 32x32 sub-tile of %alloc_1.
	iree_linalg_ext.unpack %alloc_3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_2 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
	memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2>
	} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
	// Write the finished 64x64 tile to %2, then free the staging
	// buffers.
	linalg.copy ins(%alloc_1 : memref<64x64xi32, 1>) outs(%subview : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
	memref.dealloc %alloc : memref<64x2048xi32, 1>
	memref.dealloc %alloc_0 : memref<2048x64xi32, 1>
	memref.dealloc %alloc_1 : memref<64x64xi32, 1>
	} {mapping = [#gpu.block<y>, #gpu.block<x>]}
	return
	}
	}
	}
	}
	}