// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#executable_target_embedded_elf_arm_64_1 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
hal.executable public @_ba
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from module
[ RUN ] module.batch_matmul_narrow_n_2
[module.__init+00000000] <block>
[module.__init+00000001] %i0 = vm.const.i32 527363 // 0x00080C03
[module.__init+00000008] %i1 = vm.const.i32 48 // 0x00000030
[module.__init+0000000F] %r0 = vm.const.ref.zero
[module.__init+00000012] %r1 = vm.const.ref.zero
[module.__init+00000015] %i2 = vm.const.i32 7 // 0x00000007
func.func @batch_matmul_narrow_n_2() {
%lhs = util.unfoldable_constant dense<[[
[1, 2, 0, 5],
[3, 4, -1, -3],
[5, 6, -7, 0]
], [
[-3, 1, 4, 2],
[-1, 0, 6, -1],
[1, -2, 3, -4]
]]> : tensor<2x3x4xi8>
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
func.func @batch_matmul_narrow_n_2() {
%0 = util.unfoldable_constant dense<[[[1, 2, 0, 5], [3, 4, -1, -3], [5, 6, -7, 0]], [[-3, 1, 4, 2], [-1, 0, 6, -1], [1, -2, 3, -4]]]> : tensor<2x3x4xi8>
%1 = util.unfoldable_constant dense<[[[-2, 0], [3, 1], [4, -1], [-1, 2]], [[1, -2], [2, 3], [-5, -3], [3, 0]]]> : tensor<2x4x2xi8>
%2 = util.unfoldable_constant dense<[[[1, -1], [2, 0], [3, 1]], [[4, 2], [5, 1], [6, -1]]]> : tensor<2x3x2xi32>
%3 = linalg.b
// Source
hal.executable public @main$async_dispatch_331 {
hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+evex512,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, sanitizer = "address", target_triple = "x86_64-unknown-linux-gnu", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_331_winograd_input_transform_8x8x1x86x86x256xf32 ord
diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
index 769a9c9d15..83874316f8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
@@ -97,18 +97,19 @@ static RankedTensorType transposeIfNarrowNResult(RankedTensorType tensorType) {
static RankedTensorType
getMaterializedType(RankedTensorType tensorType,
MaterializeEncodingFn materializeEncodingFn) {
- tensorType = transposeIfNarrowNResult(tensorType);
+ RankedTensorType maybeTransposedTensorType =
tools/iree-compile ~/testing/mmt4d_f32f32f32.mlir -o /tmp/mmt4d_f32f32f32.vmfb --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all
tools/iree-compile ~/testing/mmt4d_bf16bf16f32.mlir -o /tmp/mmt4d_bf16bf16f32.vmfb --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all
tools/iree-benchmark-module --module=/tmp/mmt4d_f32f32f32.vmfb --function=mmt4d_f32f32f32 --input=8x512x16x1xf32 --input=8x512x16x1xf32 --input=8x8x16x16xf32 --benchmark_min_warmup_time=1s --benchmark_min_time=1s --task_topology_max_group_count=1
tools/iree-benchmark-module --module=/tmp/mmt4d_bf16bf16f32.vmfb --function=mmt4d_bf16bf16f32 --input=8x512x16x2xbf16 --input=8x512x16x2xbf16 --input=8x8x16x16xf32 --benchmark_min_warmup_time=1 --benchmark_min_time=1s --task_topology_max_group_count=1
----------------------------------------------------------------------------------------------------
Benchmark
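
The mmt4d_f32f32f32.mlir and mmt4d_bf16bf16f32.mlir files referenced by the iree-compile commands above are not included in this capture. A minimal sketch of what the f32 variant could contain, assuming a single linalg.mmt4d whose operand shapes match the --input flags passed to iree-benchmark-module (the bf16 variant would be the same with tensor<8x512x16x2xbf16> inputs and the same f32 accumulator):

func.func @mmt4d_f32f32f32(%lhs: tensor<8x512x16x1xf32>, %rhs: tensor<8x512x16x1xf32>, %acc: tensor<8x8x16x16xf32>) -> tensor<8x8x16x16xf32> {
  // Hypothetical body: one pre-tiled matmul (M=8, N=8, K=512, M0=16, N0=16, K0=1)
  // matching the 8x512x16x1 / 8x512x16x1 / 8x8x16x16 benchmark inputs above.
  %result = linalg.mmt4d ins(%lhs, %rhs : tensor<8x512x16x1xf32>, tensor<8x512x16x1xf32>) outs(%acc : tensor<8x8x16x16xf32>) -> tensor<8x8x16x16xf32>
  return %result : tensor<8x8x16x16xf32>
}
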
hal.executable public @main$async_dispatch_47 {
hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, sanitizer = "address", target_triple = "x86_64-unknown-linux-gnu", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_47_attention_1x4096x512xf32 ordinal(0) layout(#hal.pipeline.layo

ASAN diagnostic:

AddressSanitizer:DEADLYSIGNAL
=================================================================
==743857==ERROR: AddressSanitizer: stack-overflow on address 0x75cf6576bd20 (pc 0x75cf37cde536 bp 0x75cf657fe9f0 sp 0x75cf657654c0 T2)
/usr/bin/llvm-symbolizer-18: error: '/tmp/iree_dylib_NUufEG_mem_.so': No such file or directory
    #0 0x75cf37cde536  (/tmp/iree_dylib_NUufEG_mem_.so+0x35536)
    #1 0x654d8b339137 in iree_hal_system_executable_issue_call /home/benoit/iree/runtime/src/iree/hal/local/loaders/system_library_loader.c:331:13
    #2 0x654d8b38f769 in iree_hal_cmd_dispatch_tile /home/benoit/iree/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c:877:26

Testcase:

func.func @matmul_dynamic(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
  return %result: tensor<?x?xf32>
}

Compile: