Max191 / igemm_conv.mlir
Created June 11, 2024 16:36
igemm conv with im2col op
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
  #hal.descriptor_set.layout<0, bindings = [
    #hal.descriptor_set.binding<0, storage_buffer>,
    #hal.descriptor_set.binding<1, storage_buffer>
  ]>
]>
#config_mm = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 4], thread = [1, 8, 4, 0]}>
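The maps and lowering config above describe the batch-matmul ("implicit GEMM") form the convolution takes after the im2col rewrite: (B, M, K) x (K, N) -> (B, M, N), with d3 as the reduction dimension. Below is a minimal standalone sketch of that GEMM as a plain linalg.generic using the same indexing maps; the shapes (B = 2, M = 14*14 = 196, K = 3*3*32 = 288, N = 64) and the f16 x f16 -> f32 accumulation are assumptions for illustration, not the gist's actual sizes.

#lhs_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#rhs_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
#out_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
func.func @igemm_gemm_sketch(%lhs: tensor<2x196x288xf16>, %rhs: tensor<288x64xf16>) -> tensor<2x196x64xf32> {
  // Zero-filled f32 accumulator for the f16 x f16 -> f32 GEMM.
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<2x196x64xf32>
  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<2x196x64xf32>) -> tensor<2x196x64xf32>
  // Batch matmul over the im2col'd LHS; d3 is the K (reduction) dimension.
  %gemm = linalg.generic {indexing_maps = [#lhs_map, #rhs_map, #out_map], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%lhs, %rhs : tensor<2x196x288xf16>, tensor<288x64xf16>) outs(%fill : tensor<2x196x64xf32>) {
  ^bb0(%l: f16, %r: f16, %acc: f32):
    %le = arith.extf %l : f16 to f32
    %re = arith.extf %r : f16 to f32
    %mul = arith.mulf %le, %re : f32
    %add = arith.addf %acc, %mul : f32
    linalg.yield %add : f32
  } -> tensor<2x196x64xf32>
  return %gemm : tensor<2x196x64xf32>
}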
Max191 / before_after_elementwise_fusion.mlir
Last active June 11, 2024 13:56
transpose->dequant->extract->generic
// -----// IR Dump After CSE (cse) //----- //
util.func public @jit_eval_174(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_174(%input0: tensor<1024x7x7x2xi8>) -> (%output0: tensor<1024x7x7x2xi8>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant 4.000000e+00 : f32
  %cst_2 = arith.constant 2.000000e+00 : f32
  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<1024x7x7x2xi8>
  %1 = tensor.empty() : tensor<2x1024x7x7xi8>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<1024x7x7x2xi8>) outs(%1 : tensor<2x1024x7x7xi8>) {
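In the transpose->dequant->extract->generic chain named above, the transposed i8 tensor is consumed by a dequantization generic before the final elementwise op. A minimal standalone sketch of such a dequant step follows; the shape mirrors the transpose result above, but the scale/zero-point values and their roles are assumptions, not the gist's actual constants.

func.func @dequant_sketch(%arg0: tensor<2x1024x7x7xi8>) -> tensor<2x1024x7x7xf32> {
  // Assumed quantization parameters; the gist's actual scale/zero point are not shown.
  %scale = arith.constant 4.000000e+00 : f32
  %zp = arith.constant 2.000000e+00 : f32
  %empty = tensor.empty() : tensor<2x1024x7x7xf32>
  %dq = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<2x1024x7x7xi8>) outs(%empty : tensor<2x1024x7x7xf32>) {
  ^bb0(%in: i8, %out: f32):
    // i8 -> f32, shift by the zero point, scale.
    %f = arith.sitofp %in : i8 to f32
    %shifted = arith.subf %f, %zp : f32
    %scaled = arith.mulf %shifted, %scale : f32
    linalg.yield %scaled : f32
  } -> tensor<2x1024x7x7xf32>
  return %dq : tensor<2x1024x7x7xf32>
}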
Max191 / pad_winograd.mlir
Created May 30, 2024 17:10
Pad + winograd input transform
module {
  func.func @main$async_dispatch_1_winograd_input_transform_11x11x16x8x8xf32() attributes {translation_info = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>} {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %c262144 = arith.constant 262144 : index
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c262144) : !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>>
    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>> -> tensor<16x64x64xf32>
    %3 = tensor.empty() : tensor<11x11x16x8x8xbf16>
    %4 = tensor.empty() : tensor<11x11x16x8x8xf32>
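The dispatch above pads the 16x64x64 input before the winograd input transform. A minimal sketch of just the pad step, assuming 8x8 input tiles that advance by the 6x6 output tile, so the 11 tiles per image dimension need each 64-wide dimension padded up to 10*6 + 8 = 68:

func.func @pad_for_winograd_sketch(%input: tensor<16x64x64xf32>) -> tensor<16x68x68xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // Pad only the two image dimensions on the high side so every 8x8 input tile is in bounds.
  %padded = tensor.pad %input low[0, 0, 0] high[0, 4, 4] {
  ^bb0(%i: index, %j: index, %k: index):
    tensor.yield %cst : f32
  } : tensor<16x64x64xf32> to tensor<16x68x68xf32>
  return %padded : tensor<16x68x68xf32>
}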
Max191 / README.md
Created May 28, 2024 21:49
VAE Numerics testing
Max191 / elementwise_pad.mlir
Created May 22, 2024 20:26
elementwise -> pad
%expanded_127 = tensor.expand_shape %collapsed_126 [[0], [1], [2, 3]] output_shape [1, 512, 128, 128] : tensor<1x512x16384xf32> into tensor<1x512x128x128xf32>
%23 = tensor.empty() : tensor<512x128x128xf32>
%expanded_128 = tensor.expand_shape %cst_89 [[0, 1]] output_shape [1, 512] : tensor<512xf32> into tensor<1x512xf32>
%expanded_129 = tensor.expand_shape %cst_90 [[0, 1]] output_shape [1, 512] : tensor<512xf32> into tensor<1x512xf32>
%24 = tensor.empty() : tensor<1x512x128x128xf32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_127, %expanded_128, %expanded_129 : tensor<1x512x128x128xf32>, tensor<1x512xf32>, tensor<1x512xf32>) outs(%24 : tensor<1x512x128x128xf32>) {
^bb0(%in: f32, %in_741: f32, %in_742: f32, %out: f32):
  %436 = arith.mulf
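A minimal standalone sketch of the elementwise -> pad pattern this snippet is truncated from: an elementwise linalg.generic whose result feeds a tensor.pad. The generic body (a single per-channel multiply) and the padding amounts are assumptions for illustration; only the input shapes mirror the snippet above.

func.func @elementwise_then_pad_sketch(%x: tensor<1x512x128x128xf32>, %scale: tensor<1x512xf32>) -> tensor<1x512x130x130xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<1x512x128x128xf32>
  // Elementwise generic: broadcast the per-channel scale over the spatial dims.
  %ew = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%x, %scale : tensor<1x512x128x128xf32>, tensor<1x512xf32>) outs(%empty : tensor<1x512x128x128xf32>) {
  ^bb0(%in: f32, %s: f32, %out: f32):
    %m = arith.mulf %in, %s : f32
    linalg.yield %m : f32
  } -> tensor<1x512x128x128xf32>
  // Pad the spatial dims by 1 on each side (assumed padding amounts).
  %padded = tensor.pad %ew low[0, 0, 1, 1] high[0, 0, 1, 1] {
  ^bb0(%i: index, %j: index, %k: index, %l: index):
    tensor.yield %cst : f32
  } : tensor<1x512x128x128xf32> to tensor<1x512x130x130xf32>
  return %padded : tensor<1x512x130x130xf32>
}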
Max191 / README.md
Last active May 21, 2024 21:09
IR reference for winograd ops
Max191 / packing.mlir
Created May 21, 2024 19:33
Bad pack and unpack codegen with outer_dims_perm
module {
  func.func @pack_bad(%arg0: tensor<29241x128x64xbf16>) -> tensor<64x1828x64x16x2xbf16> {
    %cst = arith.constant 0.000000e+00 : bf16
    %4 = tensor.empty() : tensor<64x1828x64x16x2xbf16>
    %pack = tensor.pack %arg0 padding_value(%cst : bf16) outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %4 : tensor<29241x128x64xbf16> -> tensor<64x1828x64x16x2xbf16>
    return %pack : tensor<64x1828x64x16x2xbf16>
  }
  func.func @pack_good(%arg0: tensor<64x29241x128xbf16>) -> tensor<64x1828x64x16x2xbf16> {
    %cst = arith.constant 0.000000e+00 : bf16
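For contrast with @pack_bad, a sketch of a pack on a pre-transposed source that needs no outer_dims_perm and only tiles the trailing dims. This is an assumption about what @pack_good looks like, not the gist's actual body; the shapes are taken from the signature above (29241 pads up to 1828 * 16, hence the padding_value).

func.func @pack_without_outer_dims_perm(%arg0: tensor<64x29241x128xbf16>) -> tensor<64x1828x64x16x2xbf16> {
  %cst = arith.constant 0.000000e+00 : bf16
  %empty = tensor.empty() : tensor<64x1828x64x16x2xbf16>
  // Tiles dims 1 and 2 by 16x2; 29241 is padded up to 1828 * 16 = 29248.
  %pack = tensor.pack %arg0 padding_value(%cst : bf16) inner_dims_pos = [1, 2] inner_tiles = [16, 2] into %empty : tensor<64x29241x128xbf16> -> tensor<64x1828x64x16x2xbf16>
  return %pack : tensor<64x1828x64x16x2xbf16>
}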
Max191 / pure_unpack.mlir
Created May 21, 2024 15:53
Bad pure unpack codegen compared to unpack + transpose
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
  hal.executable private @main$async_dispatch_330 {
    hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4",
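For pure_unpack.mlir above, a minimal standalone sketch (assumed small shapes, not the gist's IR) of the two forms being compared: a single tensor.unpack whose source outer dims are permuted relative to the destination, versus a decomposition into a linalg.transpose of the packed outer dims followed by a plain tensor.unpack; the order of the decomposition is also an assumption.

// Form 1: one unpack with outer_dims_perm; the source outer dims (8, 4) are swapped w.r.t. the destination's (4, 8).
func.func @unpack_permuted_sketch(%src: tensor<8x4x16x2xbf16>) -> tensor<64x16xbf16> {
  %empty = tensor.empty() : tensor<64x16xbf16>
  %unpack = tensor.unpack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %empty : tensor<8x4x16x2xbf16> -> tensor<64x16xbf16>
  return %unpack : tensor<64x16xbf16>
}
// Form 2: transpose the packed outer dims first, then a plain unpack with no outer_dims_perm.
func.func @transpose_then_unpack_sketch(%src: tensor<8x4x16x2xbf16>) -> tensor<64x16xbf16> {
  %t_init = tensor.empty() : tensor<4x8x16x2xbf16>
  %t = linalg.transpose ins(%src : tensor<8x4x16x2xbf16>) outs(%t_init : tensor<4x8x16x2xbf16>) permutation = [1, 0, 2, 3]
  %empty = tensor.empty() : tensor<64x16xbf16>
  %unpack = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %empty : tensor<4x8x16x2xbf16> -> tensor<64x16xbf16>
  return %unpack : tensor<64x16xbf16>
}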
module attributes {hal.device.targets = [#hal.device.target<"rocm", {legacy_sync}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>]>]} {
  hal.executable private @conv_2d_nchw_fchw_dispatch_1 {
    hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) {
      hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x242x1280x1280_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.
hal.executable public @conv_2d_nchw_fchw_dispatch_1 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) {
    hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x72x1280x1280_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x72x1280x1280_f16xf16xf32() {
        %cst = arith.constant 0.000000e+00 : f