
Andrzej Warzyński (banach-space)

banach-space / unpack-dynamic-inner-tile-auto.mlir
Last active August 14, 2025 09:11
New tiling implementation for linalg.unpack
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE: --lower-vector-mask |\
// DEFINE: mlir-opt \
// DEFINE: -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = main
// DEFINE: %{run} = mlir-runner %t -e %{entry_point} -entry-point-result=void \
// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_runner_utils
// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
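The transform schedule in this gist is erased by -test-transform-dialect-erase-schedule, so it does not appear in this preview. As a rough, hypothetical sketch only (not taken from the gist; the shapes and tile sizes below are made up), tiling a linalg.unpack with the transform dialect could look like this:

// Hypothetical sketch, not from the gist: tile a linalg.unpack via the
// transform dialect. Shapes and tile sizes are illustrative only.
module attributes {transform.with_named_sequence} {
  func.func @unpack_example(%src: tensor<8x8x4x4xf32>, %dst: tensor<32x32xf32>) -> tensor<32x32xf32> {
    // Unpack 4x4 inner tiles back into a plain 32x32 tensor.
    %0 = linalg.unpack %src inner_dims_pos = [0, 1] inner_tiles = [4, 4]
        into %dst : tensor<8x8x4x4xf32> -> tensor<32x32xf32>
    return %0 : tensor<32x32xf32>
  }

  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
    %unpack = transform.structured.match ops{["linalg.unpack"]} in %root
        : (!transform.any_op) -> !transform.any_op
    // Tile the two destination dimensions; one scf.for loop per non-zero tile size.
    %tiled, %loops:2 = transform.structured.tile_using_for %unpack tile_sizes [4, 8]
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
    transform.yield
  }
}

Run through mlir-opt -transform-interpreter, this should yield two scf.for loops around a smaller linalg.unpack, which is the shape of IR the RUN pipeline above then lowers and executes.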
banach-space / fixed.mlir
Last active July 22, 2025 13:27
Tile + Fuse + SVE
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
#map = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
func.func @pack_sve_prod() {
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  %vscale = vector.vscale
  %c8_vscale = arith.muli %vscale, %c8 : index
  %0 = affine.apply affine_map<()[s0] -> (64 ceildiv s0)>()[%c8_vscale]
banach-space / aarch64_fixed_width.mlir
Last active July 16, 2025 08:09
Scalable vectorization for linalg.pack + linalg.unpack
// Pack + unpack Ops just before vectorization
// NOTE: There are no linalg.pack Ops
// Use -mlir-print-ir-before=iree-codegen-generic-vectorization with iree-compile
%unpack = linalg.unpack %extracted_slice outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0], [8, 8], [0, 0], [0, 0]]>} : tensor<1x1x8x8xf32> -> tensor<8x8xf32>

High-level problem description

We want to compile linalg.mmt4d via "scalable" vectorisation:

%out = linalg.mmt4d ins(%lhs, %rhs: tensor<2x2x4x8xi8>, tensor<?x2x?x8xi8>) 
                   outs(%acc: tensor<2x?x4x?xi32>) -> tensor<2x?x4x?xi32>
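
As a hypothetical sketch (not from this description) of how such a request could be phrased with the transform dialect, assuming linalg.mmt4d's (M, N, K, M0, N0, K0) iteration order and purely illustrative vector sizes, with only the inner N0 dimension marked scalable; whether the vectoriser accepts this combination is exactly the kind of question being explored here:

// Hypothetical sketch, not from the gist: request masked, partly scalable
// vectorization of linalg.mmt4d. Sizes follow the (M, N, K, M0, N0, K0)
// iteration order and are illustrative only; [8] marks N0 as scalable.
module attributes {transform.with_named_sequence} {
  func.func @mmt4d(%lhs: tensor<2x2x4x8xi8>, %rhs: tensor<?x2x?x8xi8>,
                   %acc: tensor<2x?x4x?xi32>) -> tensor<2x?x4x?xi32> {
    %out = linalg.mmt4d ins(%lhs, %rhs : tensor<2x2x4x8xi8>, tensor<?x2x?x8xi8>)
                        outs(%acc : tensor<2x?x4x?xi32>) -> tensor<2x?x4x?xi32>
    return %out : tensor<2x?x4x?xi32>
  }

  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
    %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %root
        : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %mmt4d vector_sizes [2, 1, 2, 4, [8], 8] : !transform.any_op
    transform.yield
  }
}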
import torch
import torch.nn as nn
class GatherModel(nn.Module):
    def __init__(self):
        super(GatherModel, self).__init__()

    def forward(self, x):
        print("Input Tensor:")
        print(x)
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x10x20x1x1x9_i32() {
  %c0_i32 = arith.constant 0 : i32
  %c10 = arith.constant 10 : index
  %c20 = arith.constant 20 : index
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c5 = arith.constant 5 : index
  %c3 = arith.constant 3 : index
banach-space / matmul.mlir
Last active June 3, 2024 07:11
C += A*B
// The following matmul maps very nicely onto SME with SVL = 128 bits. For f32, there are 4 4x4 tiles
// that can be assembled into an 8x8 matrix.
// %mat_A_tr - transpose of A
func.func @matmul(%mat_A_tr: memref<6x8xf32>, %mat_B: memref<6x8xf32>, %mat_C: memref<8x8xf32>) {
  linalg.matmul ins(%mat_A_tr, %mat_B: memref<6x8xf32>, memref<6x8xf32>)
                outs(%mat_C: memref<8x8xf32>)
  return
}
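
For illustration only (not part of the gist): a transform-dialect schedule requesting scalable vectorisation of such a matmul in the SME style, with both the M and N dimensions scalable. The sizes [[4], [4], 1] are an assumption chosen to match 4x4 f32 tiles at SVL = 128 bits, and the schedule is meant to sit alongside the function above in a module processed by mlir-opt -transform-interpreter.

// Hypothetical sketch, not from the gist: vectorize the matmul above with
// scalable M and N (the SME-style mapping); sizes are illustrative.
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %root
        : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %matmul vector_sizes [[4], [4], 1] : !transform.any_op
    transform.yield
  }
}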
banach-space / log_conv_pad_flag_downstream.mlir
Created March 14, 2023 16:43
Generated using the pad_fusion branch of IREE
This file has been truncated.
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-unknown-unknown-eabi-elf"}> {
hal.executable.export public @pipeline_dispatch_0_depthwise_conv_2d_nhwc_hwc_1x1080x1920x1x1x43 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
  %c30 = arith.constant 30 : index
  %c18 = arith.constant 18 : index
  %c1 = arith.constant 1 : index
  hal.return %c30, %c18, %c1 : index, index, index