#loc5 = "<XLA_MLIR_DEBUGINFO_BEGIN>torchvision.models.shufflenetv2.ShuffleNetV2/torch.nn.modules.pooling.MaxPool2d_maxpool;<XLA_MLIR_DEBUGINFO_END>aten__max_pool2d"
module @IrToHlo.1937 attributes {mhlo.cross_program_prefetches = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false} {
func.func @main(%arg0: tensor<1000xf32> , %arg1: tensor<1000x1024xf32> , %arg2: tensor<1024xf32> , %arg3: tensor<1024xf32> , %arg4: tensor<1024xf32> , %arg5: tensor<1024xf32> , %arg6: tensor<1024x464x1x1xf32> , %arg7: tensor<232xf32> , %arg8: tensor<232xf32> , %arg9: tensor<232xf32> , %arg10: tensor<232xf32> , %arg11: tensor<232x232x1x1xf32> , %arg12: tensor<232xf32> , %arg13: tensor<232xf32> , %arg14: tensor<232xf32> , %arg15: tensor<232xf32> , %arg16: tensor<232x1x3x3xf32> , %arg17: tensor<232xf32> , %arg18: tensor<232xf32> , %arg19: tensor<232xf32> , %arg20: tensor<232xf32> , %arg21: tensor<232x232x1x1xf32> , %arg22: tensor<232xf32> , %arg23: tensor<232xf32> , %arg24: tensor<232xf32> , %arg25: tensor<232xf32
#loc5 = "<XLA_MLIR_DEBUGINFO_BEGIN>torchvision.models.shufflenetv2.ShuffleNetV2/torch.nn.modules.pooling.MaxPool2d_maxpool;<XLA_MLIR_DEBUGINFO_END>aten__max_pool2d"
module @IrToHlo.1937 attributes {mhlo.cross_program_prefetches = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false} {
func.func @main(%arg0: tensor<1000xf32> , %arg1: tensor<1000x1024xf32> , %arg2: tensor<1024xf32> , %arg3: tensor<1024xf32> , %arg4: tensor<1024xf32> , %arg5: tensor<1024xf32> , %arg6: tensor<1024x464x1x1xf32> , %arg7: tensor<232xf32> , %arg8: tensor<232xf32> , %arg9: tensor<232xf32> , %arg10: tensor<232xf32> , %arg11: tensor<232x232x1x1xf32> , %arg12: tensor<232xf32> , %arg13: tensor<232xf32> , %arg14: tensor<232xf32> , %arg15: tensor<232xf32> , %arg16: tensor<232x1x3x3xf32> , %arg17: tensor<232xf32> , %arg18: tensor<232xf32> , %arg19: tensor<232xf32> , %arg20: tensor<232xf32> , %arg21: tensor<232x232x1x1xf32> , %arg22: tensor<232xf32> , %arg23: tensor<232xf32> , %arg24: tensor<232xf32> , %arg25: tensor<232xf32
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "skylake-avx512", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,-avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,-avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,-sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>) {
hal.executable.export public @tensor_pack_dispatch_0_pack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], translation_info = #iree_codegen.translation_info<VMVXDefault>} {
^bb0(%arg0: !hal.device):
%c1 = arith.constant 1 : index
hal.return %c1, %c1, %c1 : index, index, index
}
builtin.module {
func.func @tensor_pack_dispatch_0_pack_f32() {
%c256 = arith.constant 256 : index
// -----// IR Dump After mlir::iree_compiler::IREE::VM::OrdinalAllocationPass (iree-vm-ordinal-allocation) //----- //
vm.module public @module attributes {ordinal_counts = #vm.ordinal_counts<import_funcs = 19, export_funcs = 2, internal_funcs = 2, global_bytes = 4, global_refs = 2, rodatas = 6, rwdatas = 0>} {
vm.global.i32 private mutable @_device_query_0 {ordinal = 0 : i32} : i32
vm.global.ref private mutable @_pipeline_layout_0 {ordinal = 0 : i32} : !vm.ref<!hal.pipeline_layout>
vm.global.ref private mutable @_executable_tensor_pack_dispatch_0 {ordinal = 1 : i32} : !vm.ref<!hal.executable>
vm.rodata private @tensor_pack_dispatch_0_vmvx_bytecode_fb {alignment = 16 : i64, mime_type = "application/x-flatbuffers", ordinal = 0 : i32} dense<"0x504B03042D0000000000000021007860132CFFFFFFFFFFFFFFFF090019006D6F64756C652E6662CAFE01000001001000FC07000000000000FC07000000000000FC070000080000004952454568F8FFFF2C0000000100000024030000380300003C03000040030000440300004C04000000000F005C04000010000000060000006D6F64756
[TRACY ] Capture Name: iree-run-module @ 2024-04-03 20:33:10
[TRACY ] Cpu Arch: x86_64
[TRACY ]
[TRACY-CPU] CPU Threads: 29
[TRACY-CPU] CPU Zones: 1695
[TRACY-CPU] Zone Stats: 2
[TRACY-CPU] Zone Count Total Main thread iree-poller iree-worker-0 iree-worker-1 iree-worker-10 iree-worker-11 iree-worker-12 iree-worker-13 iree-worker-14 iree-worker-15 iree-worker-16 iree-worker-17 iree-worker-18 iree-worker-19 iree-worker-2 iree-worker-20 iree-worker-21 iree-worker-22 iree-worker-23 iree-worker-3 iree-worker-4 iree-worker-5 iree-worker-6 iree-worker-7 iree-worker-8 iree-worker-9
[TRACY-CPU] Duration 327.9ms(100%) 72.4844ms(100%) 18.6893ms(100%) 17.7719ms(100%) 17.1578ms(100%) 11.073ms(100%) 7.05754ms(100%) 7.04673ms(100%) 7.65951ms(100%) 7.24205ms(100%) 7.59424ms(100%) 7.02599ms(100%) 6.95218ms(100%) 6.14334ms(100%) 5.46324ms(100%) 15.8939ms(100%) 4.9626ms(100%) 4.45126ms(100%) 3
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>
#device_target_vmvx = #hal.device.target<"vmvx", [#executable_target_vmvx_bytecode_fb]>
module attributes {hal.device.targets = [#device_target_vmvx]} {
func.func @tensor_pack(%arg0: tensor<127x256xf32>, %arg1: tensor<16x32x8x8xf32>) -> tensor<16x32x8x8xf32> {
%cst = arith.constant 0.000000e+00 : f32
%pack = tensor.pack %arg0 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %arg1 : tensor<127x256xf32> -> tensor<16x32x8x8xf32>
return %pack : tensor<16x32x8x8xf32>
}
}
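// The pack above works out as: dim 0 splits 127 into 16 outer tiles of 8 rows
// (16 * 8 = 128, so padding_value fills the one padded row), and dim 1 splits
// 256 evenly into 32 outer tiles of 8 columns. A hand-written sketch of the
// equivalent pad + expand_shape + transpose sequence follows; the op spellings
// (including the output_shape form of tensor.expand_shape) assume a recent
// upstream MLIR, and this is not claimed to be the exact IR any IREE pass emits.
func.func @tensor_pack_decomposed(%arg0: tensor<127x256xf32>) -> tensor<16x32x8x8xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // Pad one trailing row so both dims divide evenly by the 8x8 inner tiles.
  %padded = tensor.pad %arg0 low[0, 0] high[1, 0] {
  ^bb0(%i: index, %j: index):
    tensor.yield %cst : f32
  } : tensor<127x256xf32> to tensor<128x256xf32>
  // Split each dim into (outer, inner) tile factors: 128 -> 16x8, 256 -> 32x8.
  %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [16, 8, 32, 8] : tensor<128x256xf32> into tensor<16x8x32x8xf32>
  // Interleave so both inner tile dims land at the back: (16, 8, 32, 8) -> (16, 32, 8, 8).
  %init = tensor.empty() : tensor<16x32x8x8xf32>
  %packed = linalg.transpose ins(%expanded : tensor<16x8x32x8xf32>) outs(%init : tensor<16x32x8x8xf32>) permutation = [0, 2, 1, 3]
  return %packed : tensor<16x32x8x8xf32>
}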
// Only handles nchw*fchw -> nfhw directly.
// For nhwc*fhwc -> nhwf, first permute into that form:
// permutation_input = [0,3,1,2]
// permutation_output = [0,3,1,2]
// permutation_window = [0,2,3,1]
// If feature_group_count != 1 or batch_group_count != 1, the reshaped
// %input, %window, and %output must be given with expanded dims:
// %input and %window are rank 5 and %output is rank 7.
// (A concrete nchw*fchw example follows the maps below.)
#map0 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d4, d5, d6)>
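// A minimal stablehlo.convolution already in the supported nchw*fchw -> nfhw
// form, for reference (shapes here are illustrative; feature_group_count and
// batch_group_count are 1, so no expanded-dims reshape is needed):
func.func @conv_nchw_fchw(%input: tensor<1x4x16x16xf32>, %filter: tensor<8x4x3x3xf32>) -> tensor<1x8x14x14xf32> {
  // [b, f, 0, 1] x [o, i, 0, 1] -> [b, f, 0, 1] spells nchw * fchw -> nfhw.
  %0 = stablehlo.convolution(%input, %filter)
         dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1],
         window = {stride = [1, 1], pad = [[0, 0], [0, 0]]}
         {batch_group_count = 1 : i64, feature_group_count = 1 : i64}
       : (tensor<1x4x16x16xf32>, tensor<8x4x3x3xf32>) -> tensor<1x8x14x14xf32>
  return %0 : tensor<1x8x14x14xf32>
}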
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
#device_target_vmvx = #hal.device.target<"vmvx", [#executable_target_vmvx_bytecode_fb]>
module attributes {hal.device.targets = [#device_target_vmvx]} {
func.func @abs_4d_f16(%arg0: tensor<5x5x5x5xf16>) -> tensor<5x5x5x5xf16> {
%0 = stablehlo.abs %arg0 : tensor<5x5x5x5xf16>
return %0 : tensor<5x5x5x5xf16>
}
}
func.func @abs_4d_f16(%a : tensor<5x5x5x5xf16>) -> tensor<5x5x5x5xf16> {
%r = stablehlo.abs %a : tensor<5x5x5x5xf16>
return %r : tensor<5x5x5x5xf16>
}
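// For reference, a hand-written sketch of roughly what the StableHLO-to-linalg
// path turns the abs above into: an elementwise linalg.generic over identity
// maps applying math.absf (the actual pass output may differ in details).
#map_id4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
func.func @abs_4d_f16_linalg(%arg0: tensor<5x5x5x5xf16>) -> tensor<5x5x5x5xf16> {
  %init = tensor.empty() : tensor<5x5x5x5xf16>
  %0 = linalg.generic {indexing_maps = [#map_id4, #map_id4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
      ins(%arg0 : tensor<5x5x5x5xf16>) outs(%init : tensor<5x5x5x5xf16>) {
  ^bb0(%in: f16, %out: f16):
    %abs = math.absf %in : f16
    linalg.yield %abs : f16
  } -> tensor<5x5x5x5xf16>
  return %0 : tensor<5x5x5x5xf16>
}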