// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#executable_target_embedded_elf_arm_64_1 = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
hal.executable public @_ba
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from module
[ RUN ] module.batch_matmul_narrow_n_2
[module.__init+00000000] <block>
[module.__init+00000001] %i0 = vm.const.i32 527363 // 0x00080C03
[module.__init+00000008] %i1 = vm.const.i32 48 // 0x00000030
[module.__init+0000000F] %r0 = vm.const.ref.zero
[module.__init+00000012] %r1 = vm.const.ref.zero
[module.__init+00000015] %i2 = vm.const.i32 7 // 0x00000007
func.func @batch_matmul_narrow_n_2() {
%lhs = util.unfoldable_constant dense<[[
[1, 2, 0, 5],
[3, 4, -1, -3],
[5, 6, -7, 0]
], [
[-3, 1, 4, 2],
[-1, 0, 6, -1],
[1, -2, 3, -4]
]]> : tensor<2x3x4xi8>
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "arm64-unknown-unknown-eabi-elf"}>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_arm_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
func.func @batch_matmul_narrow_n_2() {
%0 = util.unfoldable_constant dense<[[[1, 2, 0, 5], [3, 4, -1, -3], [5, 6, -7, 0]], [[-3, 1, 4, 2], [-1, 0, 6, -1], [1, -2, 3, -4]]]> : tensor<2x3x4xi8>
%1 = util.unfoldable_constant dense<[[[-2, 0], [3, 1], [4, -1], [-1, 2]], [[1, -2], [2, 3], [-5, -3], [3, 0]]]> : tensor<2x4x2xi8>
%2 = util.unfoldable_constant dense<[[[1, -1], [2, 0], [3, 1]], [[4, 2], [5, 1], [6, -1]]]> : tensor<2x3x2xi32>
%3 = linalg.b
// Source
hal.executable public @main$async_dispatch_331 {
hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+evex512,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, sanitizer = "address", target_triple = "x86_64-unknown-linux-gnu", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_331_winograd_input_transform_8x8x1x86x86x256xf32 ord
diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
index 769a9c9d15..83874316f8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
@@ -97,18 +97,19 @@ static RankedTensorType transposeIfNarrowNResult(RankedTensorType tensorType) {
static RankedTensorType
getMaterializedType(RankedTensorType tensorType,
MaterializeEncodingFn materializeEncodingFn) {
- tensorType = transposeIfNarrowNResult(tensorType);
+ RankedTensorType maybeTransposedTensorType =
tools/iree-compile ~/testing/mmt4d_f32f32f32.mlir -o /tmp/mmt4d_f32f32f32.vmfb --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all
tools/iree-compile ~/testing/mmt4d_bf16bf16f32.mlir -o /tmp/mmt4d_bf16bf16f32.vmfb --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all
tools/iree-benchmark-module --module=/tmp/mmt4d_f32f32f32.vmfb --function=mmt4d_f32f32f32 --input=8x512x16x1xf32 --input=8x512x16x1xf32 --input=8x8x16x16xf32 --benchmark_min_warmup_time=1s --benchmark_min_time=1s --task_topology_max_group_count=1
tools/iree-benchmark-module --module=/tmp/mmt4d_bf16bf16f32.vmfb --function=mmt4d_bf16bf16f32 --input=8x512x16x2xbf16 --input=8x512x16x2xbf16 --input=8x8x16x16xf32 --benchmark_min_warmup_time=1 --benchmark_min_time=1s --task_topology_max_group_count=1
----------------------------------------------------------------------------------------------------
Benchmark
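
The mmt4d_f32f32f32.mlir and mmt4d_bf16bf16f32.mlir files referenced by the iree-compile commands above are not included in this capture. A minimal sketch of what the f32 variant could contain, assuming a single linalg.mmt4d whose operand shapes match the --input flags passed to iree-benchmark-module (the bf16 variant would be the same with tensor<8x512x16x2xbf16> inputs and the same f32 accumulator):

func.func @mmt4d_f32f32f32(%lhs: tensor<8x512x16x1xf32>, %rhs: tensor<8x512x16x1xf32>, %acc: tensor<8x8x16x16xf32>) -> tensor<8x8x16x16xf32> {
  // Hypothetical body: one pre-tiled matmul (M=8, N=8, K=512, M0=16, N0=16, K0=1)
  // matching the 8x512x16x1 / 8x512x16x1 / 8x8x16x16 benchmark inputs above.
  %result = linalg.mmt4d ins(%lhs, %rhs : tensor<8x512x16x1xf32>, tensor<8x512x16x1xf32>) outs(%acc : tensor<8x8x16x16xf32>) -> tensor<8x8x16x16xf32>
  return %result : tensor<8x8x16x16xf32>
}
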
hal.executable public @main$async_dispatch_47 {
hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : i64, sanitizer = "address", target_triple = "x86_64-unknown-linux-gnu", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_47_attention_1x4096x512xf32 ordinal(0) layout(#hal.pipeline.layo

ASAN diagnostic:

AddressSanitizer:DEADLYSIGNAL
=================================================================
==743857==ERROR: AddressSanitizer: stack-overflow on address 0x75cf6576bd20 (pc 0x75cf37cde536 bp 0x75cf657fe9f0 sp 0x75cf657654c0 T2)
/usr/bin/llvm-symbolizer-18: error: '/tmp/iree_dylib_NUufEG_mem_.so': No such file or directory
    #0 0x75cf37cde536  (/tmp/iree_dylib_NUufEG_mem_.so+0x35536)
    #1 0x654d8b339137 in iree_hal_system_executable_issue_call /home/benoit/iree/runtime/src/iree/hal/local/loaders/system_library_loader.c:331:13
    #2 0x654d8b38f769 in iree_hal_cmd_dispatch_tile /home/benoit/iree/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c:877:26

Testcase:

func.func @matmul_dynamic(%lhs: tensor<?x?xf32>, %rhs: tensor<?x?xf32>, %acc: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %result = linalg.matmul ins(%lhs, %rhs: tensor<?x?xf32>, tensor<?x?xf32>) outs(%acc: tensor<?x?xf32>) -> tensor<?x?xf32>
  return %result: tensor<?x?xf32>
}

Compile: