Skip to content

Instantly share code, notes, and snippets.

@yzhang93
Last active February 27, 2024 05:27
Show Gist options
  • Save yzhang93/b4cf2bfa8f4a0ab0b09c459b368fcffe to your computer and use it in GitHub Desktop.
Save yzhang93/b4cf2bfa8f4a0ab0b09c459b368fcffe to your computer and use it in GitHub Desktop.
// Attribute aliases shared by the module below.
//
// Executable target: "amd-aie" / "amdaie-xclbin-fb". The target_arch value is
// the literal string "chip-tbd".
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>
// Single-index scaling maps: multiply the input index by 64, 32 and 8.
// They are used below to turn loop/forall induction variables into element
// offsets for memref.subview.
#map = affine_map<(d0) -> (d0 * 64)>
#map1 = affine_map<(d0) -> (d0 * 32)>
#map2 = affine_map<(d0) -> (d0 * 8)>
// Indexing maps over a 6-d iteration space (d0..d5) for the linalg.generic
// below: #map3 indexes the first input, #map4 the second input, #map5 the
// output. d2 and d5 appear only in the input maps (they are the dimensions
// marked "reduction" in the generic's iterator_types).
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
// Pipeline layout: no push constants, one descriptor set (0) with three
// storage-buffer bindings. Bindings 0 and 1 carry the ReadOnly flag;
// binding 2 does not.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
// Translation info: TransformDialectCodegen with codegen_spec pointing at the
// symbol @__transform_main (that symbol is not defined in this listing).
#translation = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @__transform_main>
// Device target "amd-aie" exposing the executable target above, with the
// legacy_sync unit attribute set.
#device_target_amd_aie = #hal.device.target<"amd-aie", {executable_targets = [#executable_target_amdaie_xclbin_fb], legacy_sync}>
// Module holding one HAL executable with a single 2048x2048x2048 i32 matmul
// dispatch, tiled through buffers in memory spaces 1 and 2.
// NOTE(review): memory spaces 1 and 2 presumably correspond to two levels of
// on-chip memory on the AIE target - confirm against the backend.
module attributes {hal.device.targets = [#device_target_amd_aie]} {
hal.executable private @matmul_example_dispatch_0 {
hal.executable.variant public @amdaie_xclbin_fb target(#executable_target_amdaie_xclbin_fb) {
// Export entry. Its region returns the three index values (32, 32, 1).
// NOTE(review): presumably the workgroup count; 32x32 matches the bounds of
// the outer scf.forall in the function body - confirm.
hal.executable.export public @matmul_example_dispatch_0_matmul_2048x2048x2048_i32 ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
^bb0(%arg0: !hal.device):
%c32 = arith.constant 32 : index
%c1 = arith.constant 1 : index
hal.return %c32, %c32, %c1 : index, index, index
}
builtin.module {
func.func @matmul_example_dispatch_0_matmul_2048x2048x2048_i32() {
%c4 = arith.constant 4 : index
%c256 = arith.constant 256 : index
%c2048 = arith.constant 2048 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
// Bind the three storage buffers as 2048x2048xi32 memrefs, each asserted to
// be 64-byte aligned. %0 and %1 are ReadOnly (the inputs); %2 is written at
// the end of each outer iteration (the output).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
// Outer 32x32 forall (mapped to gpu.block<y>, gpu.block<x>): each iteration
// produces the 64x64 output tile at offset (%arg0 * 64, %arg1 * 64).
scf.forall (%arg0, %arg1) in (32, 32) {
%3 = affine.apply #map(%arg0)
%4 = affine.apply #map(%arg1)
%subview = memref.subview %2[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Stage the 64x2048 row panel of %0 (rows starting at %arg0 * 64) into a
// memory-space-1 buffer, 256 columns per loop iteration.
%alloc = memref.alloc() : memref<64x2048xi32, 1>
scf.for %arg2 = %c0 to %c2048 step %c256 {
%5 = affine.apply #map(%arg0)
%subview_2 = memref.subview %0[%5, %arg2] [64, 256] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x256xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %alloc[0, %arg2] [64, 256] [1, 1] : memref<64x2048xi32, 1> to memref<64x256xi32, strided<[2048, 1], offset: ?>, 1>
linalg.copy ins(%subview_2 : memref<64x256xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<64x256xi32, strided<[2048, 1], offset: ?>, 1>)
}
// Stage the 2048x64 column panel of %1 (columns starting at %arg1 * 64) into
// a memory-space-1 buffer, 256 rows per loop iteration.
%alloc_0 = memref.alloc() : memref<2048x64xi32, 1>
scf.for %arg2 = %c0 to %c2048 step %c256 {
%5 = affine.apply #map(%arg1)
%subview_2 = memref.subview %1[%arg2, %5] [256, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<256x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %alloc_0[%arg2, 0] [256, 64] [1, 1] : memref<2048x64xi32, 1> to memref<256x64xi32, strided<[64, 1], offset: ?>, 1>
linalg.copy ins(%subview_2 : memref<256x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<256x64xi32, strided<[64, 1], offset: ?>, 1>)
}
// 64x64 result tile in memory space 1; it is filled by the inner forall and
// then copied out to %subview.
%alloc_1 = memref.alloc() : memref<64x64xi32, 1>
// Inner 2x2 forall (mapped to gpu.thread<y>, gpu.thread<x>): each iteration
// computes the 32x32 sub-tile of %alloc_1 at (%arg2 * 32, %arg3 * 32).
scf.forall (%arg2, %arg3) in (2, 2) {
%5 = affine.apply #map1(%arg2)
%6 = affine.apply #map1(%arg3)
%subview_2 = memref.subview %alloc_1[%5, %6] [32, 32] [1, 1] : memref<64x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
// Packed accumulator (4x8x4x8) in memory space 2, zero-initialised before
// the reduction loop.
%alloc_3 = memref.alloc() : memref<4x8x4x8xi32, 2>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<4x8x4x8xi32, 2>)
// Reduction loop: %arg4 takes the values 0, 4, ..., 252 and #map2 scales it
// by 8, so the offset along the 2048-long dimension advances in steps of 32
// (0, 32, ..., 2016), i.e. 64 chunks of width 32.
scf.for %arg4 = %c0 to %c256 step %c4 {
%7 = affine.apply #map1(%arg2)
%8 = affine.apply #map2(%arg4)
// 32x32 slice of the staged %0 panel, packed with 4x8 inner tiles into a
// fresh memory-space-2 buffer.
%subview_4 = memref.subview %alloc[%7, %8] [32, 32] [1, 1] : memref<64x2048xi32, 1> to memref<32x32xi32, strided<[2048, 1], offset: ?>, 1>
%alloc_5 = memref.alloc() : memref<4x8x4x8xi32, 2>
iree_linalg_ext.pack %subview_4 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_5 : (memref<32x32xi32, strided<[2048, 1], offset: ?>, 1> memref<4x8x4x8xi32, 2>)
%9 = affine.apply #map2(%arg4)
%10 = affine.apply #map1(%arg3)
// 32x32 slice of the staged %1 panel, packed with 8x8 inner tiles into a
// fresh memory-space-2 buffer.
%subview_6 = memref.subview %alloc_0[%9, %10] [32, 32] [1, 1] : memref<2048x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
%alloc_7 = memref.alloc() : memref<4x4x8x8xi32, 2>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc_7 : (memref<32x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x4x8x8xi32, 2>)
// Multiply-accumulate on the packed operands: out += in * in_8, reducing
// over d2 and d5 and accumulating into %alloc_3 across loop iterations.
linalg.generic {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_5, %alloc_7 : memref<4x8x4x8xi32, 2>, memref<4x4x8x8xi32, 2>) outs(%alloc_3 : memref<4x8x4x8xi32, 2>) {
^bb0(%in: i32, %in_8: i32, %out: i32):
%11 = arith.muli %in, %in_8 : i32
%12 = arith.addi %out, %11 : i32
linalg.yield %12 : i32
}
// The packed input buffers are per-iteration temporaries.
memref.dealloc %alloc_5 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_7 : memref<4x4x8x8xi32, 2>
}
// Unpack the accumulated 4x8x4x8 result into this iteration's 32x32 sub-tile
// of %alloc_1, then release the accumulator.
iree_linalg_ext.unpack %alloc_3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_2 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
memref.dealloc %alloc_3 : memref<4x8x4x8xi32, 2>
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Write the finished 64x64 tile back to the output buffer and free the
// memory-space-1 staging buffers.
linalg.copy ins(%alloc_1 : memref<64x64xi32, 1>) outs(%subview : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<64x2048xi32, 1>
memref.dealloc %alloc_0 : memref<2048x64xi32, 1>
memref.dealloc %alloc_1 : memref<64x64xi32, 1>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment