Created May 21, 2024 15:53
Bad pure unpack codegen compared to unpack + transpose
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
  hal.executable private @main$async_dispatch_330 {
    hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>) {
      hal.executable.export public @main$async_dispatch_330_unpack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main$async_dispatch_330_unpack_f32() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
          %0 = hal.interface.constant.load[0] : i32
          %1 = hal.interface.constant.load[1] : i32
          %2 = arith.index_castui %0 : i32 to index
          %3 = arith.index_castui %1 : i32 to index
          %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>>
          %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
          %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0], sizes = [64, 1828, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>> -> tensor<64x1828x8x16x16xf32>
          %7 = tensor.empty() : tensor<29241x128x64xf32>
          %unpack = tensor.unpack %6 outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 1]]>} : tensor<64x1828x8x16x16xf32> -> tensor<29241x128x64xf32>
          flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0, 0], sizes = [29241, 128, 64], strides = [1, 1, 1] : tensor<29241x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
          return
        }
      }
    }
  }
  util.global private mutable @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
  util.initializer {
    %c9583067136 = arith.constant 9583067136 : index
    %c-1_i64 = arith.constant -1 : i64
    %c0 = arith.constant 0 : index
    %device_0 = hal.devices.get %c0 : !hal.device
    %allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c9583067136}
    util.global.store %buffer, @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
    util.return
  }
  util.func public @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
    %c-1_i32 = arith.constant -1 : i32
    %c-1_i64 = arith.constant -1 : i64
    %c1 = arith.constant 1 : index
    %c4791533568 = arith.constant 4791533568 : index
    %c-1783627776_i32 = arith.constant -1783627776 : i32
    %c1552941056_i32 = arith.constant 1552941056 : i32
    %c0 = arith.constant 0 : index
    %0 = arith.index_cast %arg0 : i32 to index
    %device_0 = hal.devices.get %c0 : !hal.device
    %cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
    %pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
    hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c1552941056_i32, %c-1783627776_i32]) : i32, i32
    %main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer = util.global.load @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
      %c0 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer)[%c0, %c4791533568],
      %c1 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer)[%c4791533568, %c4791533568]
    ])
    %workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_f32) : index, index, index
    %exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main$async_dispatch_330) : !hal.executable
    %ordinal = hal.executable.export.ordinal target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_f32) : index
    scf.for %arg1 = %c0 to %0 step %c1 {
      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
      hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    }
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %1 = util.null : !hal.fence
    %fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
    %status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    util.return
  }
}
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
  hal.executable private @main$async_dispatch_330 {
    hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>) {
      hal.executable.export public @main$async_dispatch_330_unpack_transpose_29241x128x64_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main$async_dispatch_330_unpack_transpose_29241x128x64_f32() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
          %0 = hal.interface.constant.load[0] : i32
          %1 = hal.interface.constant.load[1] : i32
          %2 = arith.index_castui %0 : i32 to index
          %3 = arith.index_castui %1 : i32 to index
          %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>>
          %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
          %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0], sizes = [64, 1828, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>> -> tensor<64x1828x8x16x16xf32>
          %7 = tensor.empty() : tensor<29241x128x64xf32>
          %8 = tensor.empty() : tensor<64x29241x128xf32>
          %unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %8 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<64x1828x8x16x16xf32> -> tensor<64x29241x128xf32>
          %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack : tensor<64x29241x128xf32>) outs(%7 : tensor<29241x128x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} {
          ^bb0(%in: f32, %out: f32):
            linalg.yield %in : f32
          } -> tensor<29241x128x64xf32>
          flow.dispatch.tensor.store %9, %5, offsets = [0, 0, 0], sizes = [29241, 128, 64], strides = [1, 1, 1] : tensor<29241x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
          return
        }
      }
    }
  }
  util.global private mutable @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
  util.initializer {
    %c9583067136 = arith.constant 9583067136 : index
    %c-1_i64 = arith.constant -1 : i64
    %c0 = arith.constant 0 : index
    %device_0 = hal.devices.get %c0 : !hal.device
    %allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c9583067136}
    util.global.store %buffer, @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
    util.return
  }
  util.func public @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
    %c-1_i32 = arith.constant -1 : i32
    %c-1_i64 = arith.constant -1 : i64
    %c1 = arith.constant 1 : index
    %c4791533568 = arith.constant 4791533568 : index
    %c-1783627776_i32 = arith.constant -1783627776 : i32
    %c1552941056_i32 = arith.constant 1552941056 : i32
    %c0 = arith.constant 0 : index
    %0 = arith.index_cast %arg0 : i32 to index
    %device_0 = hal.devices.get %c0 : !hal.device
    %cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
    %pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
    hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c1552941056_i32, %c-1783627776_i32]) : i32, i32
    %main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer = util.global.load @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
      %c0 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer)[%c0, %c4791533568],
      %c1 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer)[%c4791533568, %c4791533568]
    ])
    %workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_transpose_29241x128x64_f32) : index, index, index
    %exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main$async_dispatch_330) : !hal.executable
    %ordinal = hal.executable.export.ordinal target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_transpose_29241x128x64_f32) : index
    scf.for %arg1 = %c0 to %0 step %c1 {
      hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
      hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    }
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %1 = util.null : !hal.fence
    %fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
    %status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    util.return
  }
}
The unpack_transpose.mlir version results in a faster benchmark, so there is likely some inefficient codegen for the pure_pack.mlir case. One thing to note is that the vector sizes for the unpack + transpose are [16, 16, 16], while the pure unpack vector sizes are [16, 16, 1]. This may be the root of the inefficiency.
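For quick comparison, the relevant difference between the two dispatches is the tensor.unpack layout and the lowering_config tile sizes, excerpted below from the files above (the trailing transpose linalg.generic in the second file is omitted):

// Pure unpack (first file): the unpack writes directly into the transposed
// 29241x128x64 output layout; the second-level (vector) tile sizes are [16, 16, 1].
%unpack = tensor.unpack %6 outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 1]]>} : tensor<64x1828x8x16x16xf32> -> tensor<29241x128x64xf32>

// Unpack + transpose (second file): the unpack keeps the packed dimension order
// and a separate linalg.generic transposes 64x29241x128 to 29241x128x64; here the
// second-level (vector) tile sizes are [16, 16, 16] for both ops.
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %8 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<64x1828x8x16x16xf32> -> tensor<64x29241x128xf32>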