Repro:
tools/iree-opt /tmp/a.mlir --pass-pipeline="builtin.module(func.func(iree-codegen-convert-to-destination-passing-style),canonicalize,cse)"
Result:
The tensor.pack output-operand is no longer empty.
Repro:
tools/iree-opt /tmp/a.mlir --pass-pipeline="builtin.module(func.func(iree-codegen-convert-to-destination-passing-style),canonicalize,cse)"
Result:
The tensor.pack output-operand is no longer empty.
This tutorial is simultaneously about IREE, MLIR, and specifically the MLIR Linalg dialect.
MLIR is a programming language, but MLIR in itself is almost just an empty shell. What it really provides is a framework for defining MLIR dialects, which are where the features come from.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp | |
index 002c34f425..e4dd03268f 100644 | |
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp | |
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp | |
@@ -1506,7 +1506,8 @@ static void setX86VectorTileSizes(linalg::GenericOp genericOp, | |
SmallVectorImpl<int64_t> &vecTileSizes) { | |
vecTileSizes.append(numLoops, 0); | |
SmallVector<int64_t> staticLoopRanges = genericOp.getStaticLoopRanges(); | |
- for (auto loopNum : llvm::seq<unsigned>(0, numLoops)) { | |
+ int64_t maxRemainingVecTileSize = 1024; |
.section .text.turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,"ax",@progbits | |
.p2align 4, 0x90 | |
.type turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack,@function | |
turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_generic_Dx8640x3200_f16_pack: | |
.Lfunc_begin1: | |
.loc 1 1 0 is_stmt 1 | |
.cfi_startproc | |
push rbp | |
.cfi_def_cfa_offset 16 | |
.cfi_offset rbp, -16 |
Args: tools/iree-compile --iree-llvmcpu-link-embedded=false --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=znver4 --iree-llvmcpu-enable-ukernels=all /home/benoit/matmul_n4_i8.mlir -o /tmp/a.vmfb --iree-hal-dump-executable-intermediates-to=/tmp -mlir-disable-threading -mlir-print-ir-before-all -mlir-print-ir-after-all -debug | |
Load new dialect in Context builtin | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ShapedType) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::MemRefLayoutAttrInterface) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::TypedAttr) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::ElementsAttr) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::DistinctAttr) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::BytecodeOpInterface) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::SymbolOpInterface) | |
ImplicitTypeIDRegistry::lookupOrInsert(mlir::OpAsmOpInterface) |
2024-03-20T09:54:17.450726-04:00 hocher kernel: [ 4.223485] [drm] amdgpu kernel modesetting enabled. | |
2024-03-20T09:54:17.450726-04:00 hocher kernel: [ 4.223605] amdgpu: CRAT table disabled by module option | |
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223607] amdgpu: Virtual CRAT table created for CPU | |
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223616] amdgpu: Topology: Add CPU node | |
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223695] amdgpu 0000:03:00.0: enabling device (0006 -> 0007) | |
2024-03-20T09:54:17.450726-04:00 hocher sbkeysync[1334]: e36dfc719d2114c2e39aea88849e2845ab326f6f7fe74e0e539b7e54d81f3631 | |
2024-03-20T09:54:17.450727-04:00 hocher kernel: [ 4.223756] [drm] initializing kernel modesetting (VEGA10 0x1002:0x687F 0x1462:0x3681 0xC3). | |
2024-03-20T09:54:17.450728-04:00 hocher kernel: [ 4.223764] [drm] register mmio base: 0xF6C00000 | |
2024-03-20T09:54:17.450728-04:00 hocher kernel: [ 4.223765] [drm] register mmio size: 524288 | |
2024-03-20T09:54:17.450728-04:0 |
NameLoc: main | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6 | |
NameLoc: MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6 |
diff --git a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc b/tmp/BuiltinDialectBytecode.cpp.inc | |
index 26072dbe9c8f..015598623db4 100644 | |
--- a/home/benoit/iree-build/llvm-project/tools/mlir/include/mlir/IR/BuiltinDialectBytecode.cpp.inc | |
+++ b/tmp/BuiltinDialectBytecode.cpp.inc | |
@@ -355,6 +355,27 @@ static void write(UnknownLoc attribute, DialectBytecodeWriter &writer) { | |
writer.writeVarInt(/* UnknownLoc */ 15); | |
} | |
+namespace { | |
+struct Logger { |
Download the MLIR model code without parameters (so that's lightweight and will compile fast): https://storage.googleapis.com/shark_tank/elias/facebook_opt_1.3b.mlir
Download the parameters: https://storage.googleapis.com/shark_tank/elias/facebook_opt_1.3b_weights.irpa
The command lines below assume that these files have been downloaded under $HOME/testing.
builtin.module @calls attributes { | |
} { | |
func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view | |
func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) | |
func.func private @module.matmul_accumulate_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view | |
func.func private @module.matmul_accumulate_1x1xi8_times_1x1xi8_into_1x1xi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view | |
func.func private @module.matmul_DYNxDYNxi8_times_DYNxDYNxi8_into_DYNxDYNxi32(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view |