@Max191 · Created May 21, 2024 15:53
Bad pure unpack codegen compared to unpack + transpose

The first module below is the pure unpack dispatch (pure_pack.mlir); the second is the equivalent unpack + transpose dispatch (unpack_transpose.mlir).
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
hal.executable private @main$async_dispatch_330 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_330_unpack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main$async_dispatch_330_unpack_f32() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = arith.index_castui %0 : i32 to index
%3 = arith.index_castui %1 : i32 to index
%4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0], sizes = [64, 1828, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>> -> tensor<64x1828x8x16x16xf32>
%7 = tensor.empty() : tensor<29241x128x64xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 1]]>} : tensor<64x1828x8x16x16xf32> -> tensor<29241x128x64xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0, 0], sizes = [29241, 128, 64], strides = [1, 1, 1] : tensor<29241x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
return
}
}
}
}
util.global private mutable @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
util.initializer {
%c9583067136 = arith.constant 9583067136 : index
%c-1_i64 = arith.constant -1 : i64
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c9583067136}
util.global.store %buffer, @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
util.return
}
util.func public @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1 = arith.constant 1 : index
%c4791533568 = arith.constant 4791533568 : index
%c-1783627776_i32 = arith.constant -1783627776 : i32
%c1552941056_i32 = arith.constant 1552941056 : i32
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device_0 = hal.devices.get %c0 : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c1552941056_i32, %c-1783627776_i32]) : i32, i32
%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer = util.global.load @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer)[%c0, %c4791533568],
%c1 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer)[%c4791533568, %c4791533568]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_f32) : index, index, index
%exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main$async_dispatch_330) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_f32) : index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
util.return
}
}
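
unpack_transpose.mlir, the unpack + transpose dispatch:
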
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
hal.executable private @main$async_dispatch_330 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_330_unpack_transpose_29241x128x64_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main$async_dispatch_330_unpack_transpose_29241x128x64_f32() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = arith.index_castui %0 : i32 to index
%3 = arith.index_castui %1 : i32 to index
%4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0], sizes = [64, 1828, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>> -> tensor<64x1828x8x16x16xf32>
%7 = tensor.empty() : tensor<29241x128x64xf32>
%8 = tensor.empty() : tensor<64x29241x128xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %8 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<64x1828x8x16x16xf32> -> tensor<64x29241x128xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack : tensor<64x29241x128xf32>) outs(%7 : tensor<29241x128x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<29241x128x64xf32>
flow.dispatch.tensor.store %9, %5, offsets = [0, 0, 0], sizes = [29241, 128, 64], strides = [1, 1, 1] : tensor<29241x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
return
}
}
}
}
util.global private mutable @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
util.initializer {
%c9583067136 = arith.constant 9583067136 : index
%c-1_i64 = arith.constant -1 : i64
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c9583067136}
util.global.store %buffer, @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
util.return
}
util.func public @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1 = arith.constant 1 : index
%c4791533568 = arith.constant 4791533568 : index
%c-1783627776_i32 = arith.constant -1783627776 : i32
%c1552941056_i32 = arith.constant 1552941056 : i32
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device_0 = hal.devices.get %c0 : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c1552941056_i32, %c-1783627776_i32]) : i32, i32
%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer = util.global.load @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer)[%c0, %c4791533568],
%c1 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer)[%c4791533568, %c4791533568]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_transpose_29241x128x64_f32) : index, index, index
%exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main$async_dispatch_330) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_transpose_29241x128x64_f32) : index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
util.return
}
}
Max191 (author) commented on May 21, 2024:

The unpack_transpose.mlir version results in a faster benchmark, so there is likely some inefficient codegen in the pure_pack.mlir case. One thing to note is that the vector sizes for the unpack + transpose are [16, 16, 16], while the pure unpack vector sizes are [16, 16, 1]. A vector size of 1 on the innermost dimension of the 29241x128x64 result means the contiguous output dimension is not vectorized, so the inner vector lanes step through the 128-sized dimension at a stride of 64 f32s instead of writing unit-stride; this may be the root of the inefficiency.
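
For reference, stripped of the HAL boilerplate, the two dispatches reduce to the following pair of semantically equivalent forms (a minimal sketch; @unpack_forms and the SSA names are hypothetical, and linalg.transpose is used here as shorthand for the linalg.generic transpose in the second listing):

func.func @unpack_forms(%src: tensor<64x1828x8x16x16xf32>) -> (tensor<29241x128x64xf32>, tensor<29241x128x64xf32>) {
  %dest = tensor.empty() : tensor<29241x128x64xf32>
  // Form 1 (pure_pack.mlir): the transpose is folded into the unpack through
  // outer_dims_perm, leaving the innermost result dimension (64) untiled.
  %pure = tensor.unpack %src outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<64x1828x8x16x16xf32> -> tensor<29241x128x64xf32>
  // Form 2 (unpack_transpose.mlir): identity-permutation unpack into a
  // temporary, followed by an explicit transpose into the final layout.
  %tmp = tensor.empty() : tensor<64x29241x128xf32>
  %unpacked = tensor.unpack %src outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %tmp : tensor<64x1828x8x16x16xf32> -> tensor<64x29241x128xf32>
  %transposed = linalg.transpose ins(%unpacked : tensor<64x29241x128xf32>) outs(%dest : tensor<29241x128x64xf32>) permutation = [1, 2, 0]
  return %pure, %transposed : tensor<29241x128x64xf32>, tensor<29241x128x64xf32>
}

With form 1's [16, 16, 1] config, a vectorized store covers a 16x16x1 slice of the 29241x128x64 result, so its lanes are strided in memory; with form 2's [16, 16, 16] config, 16 lanes lie along the contiguous 64-sized dimension, filling one 512-bit vector on znver4 (matching native_vector_size = 64).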
