Skip to content

Instantly share code, notes, and snippets.

@bjacob
bjacob / a.diff
Last active April 18, 2024 20:08
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 002c34f425..e4dd03268f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1506,7 +1506,8 @@ static void setX86VectorTileSizes(linalg::GenericOp genericOp,
SmallVectorImpl<int64_t> &vecTileSizes) {
vecTileSizes.append(numLoops, 0);
SmallVector<int64_t> staticLoopRanges = genericOp.getStaticLoopRanges();
- for (auto loopNum : llvm::seq<unsigned>(0, numLoops)) {
+ int64_t maxRemainingVecTileSize = 1024;
@bjacob
bjacob / turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack.s
Created April 12, 2024 21:11
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack disassembly
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits
.p2align 4, 0x90
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack:
.Lfunc_begin1:
.loc 1 1 0 is_stmt 1
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset rbp, -16
Args: tools/iree-compile --iree-llvmcpu-link-embedded=false --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all /home/benoit/matmul_n4_i8.mlir -o /tmp/a.vmfb --iree-hal-dump-executable-intermediates-to=/tmp -mlir-disable-threading -mlir-print-ir-before-all -mlir-print-ir-after-all -debug
Load new dialect in Context builtin
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::BytecodeOpInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface)
ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmOpInterface)
2024-03-20T09:54:17.450726-04:00 hocher kernel: [ 4.223485] [drm] amdgpu kernel modesetting enabled.
2024-03-20T09:54:17.450726-04:00 hocher kernel: [ 4.223605] amdgpu: CRAT table disabled by module option
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223607] amdgpu: Virtual CRAT table created for CPU
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223616] amdgpu: Topology: Add CPU node
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223695] amdgpu 0000:03:00.0: enabling device (0006 -> 0007)
2024-03-20T09:54:17.450726-04:00 hocher sbkeysync[1334]: e36dfc719d2114c2e39aea88849e2845ab326f6f7fe74e0e539b7e54d81f3631
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223756] [drm] initializing kernel modesetting (VEGA10 0x1002:0x687F 0x1462:0x3681 0xC3).
2024-03-20T09:54:17.450728-04:00 hocher kernel: [ 4.223764] [drm] register mmio base: 0xF6C00000
2024-03-20T09:54:17.450728-04:00 hocher kernel: [ 4.223765] [drm] register mmio size: 524288
2024-03-20T09:54:17.450728-04:0
NameLoc: main
NameLoc: MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6
NameLoc: MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6
@bjacob
bjacob / a.diff
Last active March 18, 2024 18:37
diff --git a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc b/tmp/BuiltinDialectBytecode.cpp.inc
index 26072dbe9c8f..015598623db4 100644
--- a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc
+++ b/tmp/BuiltinDialectBytecode.cpp.inc
@@ -355,6 +355,27 @@ static void write(UnknownLoc attribute, DialectBytecodeWriter &writer) {
writer.writeVarInt(/* UnknownLoc */ 15);
}
+namespace {
+struct Logger {
builtin.module @calls attributes {
} {
func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
func.func private @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
@bjacob
bjacob / README.md
Last active March 18, 2024 13:48
Download, compile and run OPT-1.3b on CPU with IREE

Trying to change the ukernels' calling convention back to "default", instead of ParameterStruct.

Problem: the ukernels return void, which leads to an assertion failure in ConvertToLLVM due to a discrepancy in how a void llvm.call is represented: either as returning nothing, or as returning one value of type !llvm.void.

Getting this:

iree-compile: /home/benoit/iree/third_party/llvm-project/mlir/lib/IR/PatternMatch.cpp:153: virtual void mlir::RewriterBase::replaceOp(Operation *, ValueRange): Assertion `op->getNumResults() == newValues.size() && "incorrect # of replacement values"' failed.

Thread 7 "llvm-worker-5" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fffcf7fe6c0 (LWP 259929)]
@bjacob
bjacob / README.md
Last active February 16, 2024 20:30
Attempt at ukernel fallback to codegen

Attempt at ukernel fallback to codegen

This is to document a short-lived attempt at solving #15784 by implementing the idea laid out in the original issue description. This changes the mmt4d ukernel to return a second value, which is a status code, and changes the mmt4d-to-ukernel lowering to create an scf.if based on that status code:

%62:2 = iree_codegen.ukernel.generic "iree_uk_mmt4d" ins(%59, %60 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%61 : tensor<1x1x16x16xf32>) (%c1, %c1, %dim, %c16_i32, %c16_i32, %c1_i32, %c1281_i32 : index, index, index, i32, i32, i32, i32) fn_def_attrs {hal.import.bitcode = true, hal.import.cconv = 1 : i32, hal.import.fields = ["processor_data"]} strided_outer_dims(1) -> tensor<1x1x16x16xf32>, i32
%63 = arith.cmpi eq, %62#1, %c0_i32 : i32
%64 = scf.if %63 -> (tensor<1x1x16x16xf32>) {
  scf.yield %62#0 : tensor<1x1x16x16xf32>
} else {