#matmul_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 16, 16, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+avx512f", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
func.func @mmt4d_bias_relu_fusion_dispatch_0_generic_DxDx16x16_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load[0] : i32
  %1 = hal.interface.constant.load[1] : i32
  %2 = hal.interface.constant.load[2] : i32
  %3 = hal.interface.constant.load[3] : i32
  %46 = linalg.batch_mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 0, 16, 16, 0], [0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0]]>} ins(%41, %42 : tensor<?x?x3200x16x1xf32>, tensor<?x540x3200x16x1xf16>) outs(%45 : tensor<?x?x540x16x16xf32>) -> tensor<?x?x540x16x16xf32>
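For reference, a minimal numpy model of the contraction linalg.batch_mmt4d performs on the packed operands above (a sketch with small static shapes, not compiler output; the real dispatch uses dynamic batch/M, K1 = 3200, N1 = 540 and 16x16x1 inner tiles):

import numpy as np

# lhs: B x M1 x K1 x m0 x k0 (f32), rhs: B x N1 x K1 x n0 x k0 (f16), acc: B x M1 x N1 x m0 x n0 (f32)
# out[b, m, n, m0, n0] = acc[b, m, n, m0, n0] + sum over k, k0 of lhs[b, m, k, m0, k0] * rhs[b, n, k, n0, k0]
def batch_mmt4d_ref(lhs, rhs, acc):
    return acc + np.einsum("bmkxz,bnkyz->bmnxy", lhs, rhs.astype(lhs.dtype))

lhs = np.random.rand(2, 3, 5, 16, 1).astype(np.float32)
rhs = np.random.rand(2, 4, 5, 16, 1).astype(np.float16)
acc = np.zeros((2, 3, 4, 16, 16), dtype=np.float32)
print(batch_mmt4d_ref(lhs, rhs, acc).shape)  # (2, 3, 4, 16, 16)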
util.func public @matmul_broad(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_broad(%input0: tensor<?x?x3200xf32>, %input1: tensor<8640x3200xf16>) -> (%output0: tensor<?x?x8640xf32>)"}} {
  %cst = arith.constant 0.000000e+00 : f16
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x3200xf32>{%0, %1}
  %3 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<8640x3200xf16>
  %4 = tensor.empty(%0) : tensor<?x8640x3200xf16>
//util.func public @matmul_broad(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_broad(%input0: tensor<?x?x3200xf32>, %input1: tensor<8640x3200xf16>) -> (%output0: tensor<?x?x8640xf32>)"}} {
// %cst = arith.constant 0.000000e+00 : f32
// %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
// %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
// %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?x3200xf32>{%0, %1}
// %3 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<8640x3200xf16>
// %4 = tensor.empty() : tensor<540x3200x16x1xf16>
// %pack = tensor.pack %3 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %4 : tensor<8640x3200xf16> -> tensor<540x3200x16x1xf16>
// %collapsed = tensor.collapse_shape %pack [[0], [1], [2, 3]] : tensor<540x3200x16x1xf16> into tensor<540x3200x16xf16>
// %5 = tensor.empty(%0) : tensor<
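A numpy sketch of the layout change the commented tensor.pack / tensor.collapse_shape above apply to the 8640x3200 f16 weight (my reading of the op semantics, with inner_tiles = [16, 1] on dims [0, 1]):

import numpy as np

w = np.arange(8640 * 3200, dtype=np.float32).reshape(8640, 3200)
# tensor.pack with inner_tiles = [16, 1]: packed[i, j, ii, jj] == w[i * 16 + ii, j * 1 + jj]
packed = w.reshape(540, 16, 3200, 1).transpose(0, 2, 1, 3)   # 540x3200x16x1
# tensor.collapse_shape [[0], [1], [2, 3]]: merge the two inner tile dims
collapsed = packed.reshape(540, 3200, 16)                    # 540x3200x16
assert collapsed[1, 2, 3] == w[1 * 16 + 3, 2]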
module attributes {torch.debug_module_name = "SumModule"} {
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @forward(%arg0: tensor<1048576xf32>) -> tensor<f32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<f32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<f32>) -> tensor<f32>
    %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%arg0 : tensor<1048576xf32>) outs(%1 : tensor<f32>) {
    ^bb0(%in: f32, %out: f32):
      %3 = arith.addf %in, %out : f32
      linalg.yield %3 : f32
    } -> tensor<f32>
    return %2 : tensor<f32>
  }
}
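The numpy equivalent of this reduction, assuming @forward simply sums all 1048576 input elements into a scalar f32:

import numpy as np

x = np.random.rand(1048576).astype(np.float32)
golden = np.sum(x, dtype=np.float32)  # zero-initialized accumulator + addf over every element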
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_local]} {
  util.func public @matmul_broad(%arg0: tensor<?x?x3200xf32>, %arg1: tensor<8640x3200xf16>) -> tensor<?x?x8640xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
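A possible numpy golden reference for the matmul_broad signature above, assuming the contraction is out[b, m, n] = sum over k of x[b, m, k] * w[n, k] with the f16 weight promoted to f32 (the names below are illustrative, not from the original gist):

import numpy as np

def matmul_broad_ref(x, w):
    # x: ?x?x3200 f32, w: 8640x3200 f16 -> out: ?x?x8640 f32
    return x @ w.astype(np.float32).T

x = np.random.rand(2, 5, 3200).astype(np.float32)
w = np.random.rand(8640, 3200).astype(np.float16)
print(matmul_broad_ref(x, w).shape)  # (2, 5, 8640)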
import numpy as np

def compare_arrays(expected, computed):
    # Check if the shapes of the arrays match
    if expected.shape != computed.shape:
        print("Arrays have different shapes.")
        return
    # Find where mismatches occur (including handling NaNs)
    mismatch = ~np.isclose(expected, computed, equal_nan=True)
    for idx in zip(*np.nonzero(mismatch)):
        print(f"Mismatch at index {idx}: golden={expected[idx]}, iree={computed[idx]}")
False
False
False
Mismatch at index (np.int64(0), np.int64(2), np.int64(1)): golden=-1.6139899492263794, iree=-0.0
Mismatch at index (np.int64(0), np.int64(2), np.int64(9)): golden=-1.1718499660491943, iree=-0.0
Mismatch at index (np.int64(0), np.int64(2), np.int64(10)): golden=-1.594499945640564, iree=-0.0
Mismatch at index (np.int64(0), np.int64(2), np.int64(11)): golden=-1.9860199689865112, iree=-0.0
Mismatch at index (np.int64(0), np.int64(2), np.int64(18)): golden=-1.1132500171661377, iree=-0.0
Mismatch at index (np.int64(0), np.int64(2), np.int64(19)): golden=-2.1459200382232666, iree=-0.0
Mismatch at index (np.int64(0), np.int64(2), np.int64(20)): golden=-1.3908900022506714, iree=-0.0
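A hypothetical invocation of compare_arrays that produces output in the format above; the file names are placeholders, not from the original run:

import numpy as np

golden = np.load("golden.npy")         # reference result (e.g. from numpy/PyTorch)
iree_out = np.load("iree_output.npy")  # result produced by the compiled IREE module
compare_arrays(golden, iree_out)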
//func.func @softmax(%arg0: tensor<2x24x1178x1178xf32>) -> tensor<2x24x1178x1178xf32> {
// %c0 = arith.constant 0 : index
// %0 = tensor.empty() : tensor<2x24x1178x1178xf32>
// %1 = linalg.softmax dimension(3) ins(%arg0 : tensor<2x24x1178x1178xf32>) outs(%0 : tensor<2x24x1178x1178xf32>) -> tensor<2x24x1178x1178xf32>
// return %1 : tensor<2x24x1178x1178xf32>
//}
func.func @softmax(%arg0: tensor<2x24x1178x1178xf32>) -> tensor<2x24x1178xf32> {
  %4 = tensor.empty() : tensor<2x24x1178xf32>
  %cst = arith.constant -3.40282347E+38 : f32
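A numpy sketch of the numerically stable softmax decomposition the hand-written function above appears to begin: the 2x24x1178 tensor and the -3.40282347E+38 (f32 lowest) constant are the init for a max-reduction over the last dimension, followed by subtract, exp, and normalize:

import numpy as np

def softmax_ref(x):
    m = np.max(x, axis=-1, keepdims=True)   # the 2x24x1178 max intermediate
    e = np.exp(x - m)
    return e / np.sum(e, axis=-1, keepdims=True)

x = np.random.rand(2, 24, 8, 8).astype(np.float32)   # smaller trailing dims for a quick check
print(np.allclose(softmax_ref(x).sum(axis=-1), 1.0))  # True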
func.func @matmul_broad_dispatch_2_batch_mmt4d_DxDx540x3200x16x16x1_f32xf16xf32() attributes {translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
  %c1 = arith.constant 1 : index
  %c3200 = arith.constant 3200 : index
  %c540 = arith.constant 540 : index
  %c55296000 = arith.constant 55296000 : index
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load[0] : i32
  %1 = hal.interface.constant.load[1] : i32