Benoit Jacob bjacob

## a.diff
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 002c34f425..e4dd03268f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1506,7 +1506,8 @@ static void setX86VectorTileSizes(linalg::GenericOp genericOp,
                                   SmallVectorImpl<int64_t> &vecTileSizes) {
   vecTileSizes.append(numLoops, 0);
   SmallVector<int64_t> staticLoopRanges = genericOp.getStaticLoopRanges();
-  for (auto loopNum : llvm::seq<unsigned>(0, numLoops)) {
+  int64_t maxRemainingVecTileSize = 1024;

## turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack.s
	.section	.text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits
	.p2align	4, 0x90
	.type	turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack:
.Lfunc_begin1:
	.loc	1 1 0 is_stmt 1
	.cfi_startproc
	push	rbp
	.cfi_def_cfa_offset 16
	.cfi_offset rbp, -16

## debug.mlir
Args: tools/iree-compile --iree-llvmcpu-link-embedded=false --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all /home/benoit/matmul_n4_i8.mlir -o /tmp/a.vmfb --iree-hal-dump-executable-intermediates-to=/tmp -mlir-disable-threading -mlir-print-ir-before-all -mlir-print-ir-after-all -debug
Load new dialect in Context builtin
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::BytecodeOpInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmOpInterface)

## syslog.txt
2024-03-20T09:54:17.450726-04:00 hocher kernel: [    4.223485] [drm] amdgpu kernel modesetting enabled.
2024-03-20T09:54:17.450726-04:00 hocher kernel: [    4.223605] amdgpu: CRAT table disabled by module option
2024-03-20T09:54:17.450727-04:00 hocher kernel: [    4.223607] amdgpu: Virtual CRAT table created for CPU
2024-03-20T09:54:17.450727-04:00 hocher kernel: [    4.223616] amdgpu: Topology: Add CPU node
2024-03-20T09:54:17.450727-04:00 hocher kernel: [    4.223695] amdgpu 0000:03:00.0: enabling device (0006 -> 0007)
2024-03-20T09:54:17.450726-04:00 hocher sbkeysync[1334]:     e36dfc719d2114c2e39aea88849e2845ab326f6f7fe74e0e539b7e54d81f3631
2024-03-20T09:54:17.450727-04:00 hocher kernel: [    4.223756] [drm] initializing kernel modesetting (VEGA10 0x1002:0x687F 0x1462:0x3681 0xC3).
2024-03-20T09:54:17.450728-04:00 hocher kernel: [    4.223764] [drm] register mmio base: 0xF6C00000
2024-03-20T09:54:17.450728-04:00 hocher kernel: [    4.223765] [drm] register mmio size: 524288
2024-03-20T09:54:17.450728-04:0

## gist:31fdeb0902a42461d5cca6902aef6080
NameLoc: main
NameLoc: MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6

## a.diff
diff --git a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc b/tmp/BuiltinDialectBytecode.cpp.inc
index 26072dbe9c8f..015598623db4 100644
--- a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc
+++ b/tmp/BuiltinDialectBytecode.cpp.inc
@@ -355,6 +355,27 @@ static void write(UnknownLoc attribute, DialectBytecodeWriter &writer) {
   writer.writeVarInt(/* UnknownLoc */ 15);
 }

+namespace {
+struct Logger {

## e2e_matmul_cpu_dt_uk_i8_i32_small_llvm-cpu_local-task_calls.mlir
builtin.module @calls attributes {

} {

func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)

func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
func.func private @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view

## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                bjacob
                / README.md
            
            
              Last active
              March 18, 2024 13:48
            
              
                Download, compile and run OPT-1.3b on CPU with IREE
              
          
    Download

Download the MLIR model code without parameters (so it is lightweight and compiles fast):
https://storage.googleapis.com/shark_tank/elias/facebook_opt_1.3b.mlir
Download the parameters:
https://storage.googleapis.com/shark_tank/elias/facebook_opt_1.3b_weights.irpa
The command lines below assume that these files have been downloaded under $HOME/testing.

  
## README.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                bjacob
                / README.md
            
            
              Created
              February 21, 2024 20:48
            
          
    Trying to change ukernels calling convention back to "default", instead of ParameterStruct.
Problem: the ukernels return void, which leads to an assertion failure in ConvertToLLVM due to a discrepancy in how a void llvm.call is represented: it either returns nothing or returns one value of type !llvm.void.
Getting this:
iree-compile: /home/benoit/iree/third_party/llvm-project/mlir/lib/IR/PatternMatch.cpp:153: virtual void mlir::RewriterBase::replaceOp(Operation *, ValueRange): Assertion `op->getNumResults() == newValues.size() && "incorrect # of replacement values"' failed.

Thread 7 "llvm-worker-5" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fffcf7fe6c0 (LWP 259929)]


## README.md

      
              4 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                bjacob
                / README.md
            
            
              Last active
              February 16, 2024 20:30
            
              
                Attempt at ukernel fallback to codegen
              
          
    Attempt at ukernel fallback to codegen

This is to document a short-lived attempt at solving #15784 by implementing the idea laid out in the original issue description. This changes the mmt4d ukernel to return a second value, a status code, and changes the mmt4d-to-ukernel lowering to create an scf.if based on that status code:
%62:2 = iree_codegen.ukernel.generic "iree_uk_mmt4d" ins(%59, %60 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%61 : tensor<1x1x16x16xf32>) (%c1, %c1, %dim, %c16_i32, %c16_i32, %c1_i32, %c1281_i32 : index, index, index, i32, i32, i32, i32) fn_def_attrs {hal.import.bitcode = true, hal.import.cconv = 1 : i32, hal.import.fields = ["processor_data"]} strided_outer_dims(1) -> tensor<1x1x16x16xf32>, i32
%63 = arith.cmpi eq, %62#1, %c0_i32 : i32
%64 = scf.if %63 -> (tensor<1x1x16x16xf32>) {
  scf.yield %62#0 : tensor<1x1x16x16xf32>
} else {
	diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
	index 002c34f425..e4dd03268f 100644
	--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
	+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
	@@ -1506,7 +1506,8 @@ static void setX86VectorTileSizes(linalg::GenericOp genericOp,
	SmallVectorImpl<int64_t> &vecTileSizes) {
	vecTileSizes.append(numLoops, 0);
	SmallVector<int64_t> staticLoopRanges = genericOp.getStaticLoopRanges();
	- for (auto loopNum : llvm::seq<unsigned>(0, numLoops)) {
	+ int64_t maxRemainingVecTileSize = 1024;
	.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits
	.p2align 4, 0x90
	.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function
	turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack:
	.Lfunc_begin1:
	.loc 1 1 0 is_stmt 1
	.cfi_startproc
	push rbp
	.cfi_def_cfa_offset 16
	.cfi_offset rbp, -16
	Args: tools/iree-compile --iree-llvmcpu-link-embedded=false --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all /home/benoit/matmul_n4_i8.mlir -o /tmp/a.vmfb --iree-hal-dump-executable-intermediates-to=/tmp -mlir-disable-threading -mlir-print-ir-before-all -mlir-print-ir-after-all -debug
	Load new dialect in Context builtin
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::BytecodeOpInterface)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface)
	ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmOpInterface)
	2024-03-20T09:54:17.450726-04:00 hocher kernel: [ 4.223485] [drm] amdgpu kernel modesetting enabled.
	2024-03-20T09:54:17.450726-04:00 hocher kernel: [ 4.223605] amdgpu: CRAT table disabled by module option
	2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223607] amdgpu: Virtual CRAT table created for CPU
	2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223616] amdgpu: Topology: Add CPU node
	2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223695] amdgpu 0000:03:00.0: enabling device (0006 -> 0007)
	2024-03-20T09:54:17.450726-04:00 hocher sbkeysync[1334]: e36dfc719d2114c2e39aea88849e2845ab326f6f7fe74e0e539b7e54d81f3631
	2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223756] [drm] initializing kernel modesetting (VEGA10 0x1002:0x687F 0x1462:0x3681 0xC3).
	2024-03-20T09:54:17.450728-04:00 hocher kernel: [ 4.223764] [drm] register mmio base: 0xF6C00000
	2024-03-20T09:54:17.450728-04:00 hocher kernel: [ 4.223765] [drm] register mmio size: 524288
	2024-03-20T09:54:17.450728-04:0
	NameLoc: main
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6
	NameLoc: MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6
	diff --git a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc b/tmp/BuiltinDialectBytecode.cpp.inc
	index 26072dbe9c8f..015598623db4 100644
	--- a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc
	+++ b/tmp/BuiltinDialectBytecode.cpp.inc
	@@ -355,6 +355,27 @@ static void write(UnknownLoc attribute, DialectBytecodeWriter &writer) {
	writer.writeVarInt(/* UnknownLoc */ 15);
	}

	+namespace {
	+struct Logger {
	builtin.module @calls attributes {

	} {

	func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
	func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)

	func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
	func.func private @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
	func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view