#loc5 = "<XLA_MLIR_DEBUGINFO_BEGIN>torchvision.models.shufflenetv2.ShuffleNetV2/torch.nn.modules.pooling.MaxPool2d_maxpool;<XLA_MLIR_DEBUGINFO_END>aten__max_pool2d"
module @IrToHlo.1937 attributes {mhlo.cross_program_prefetches = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false} {
func.func @main(%arg0: tensor<1000xf32> , %arg1: tensor<1000x1024xf32> , %arg2: tensor<1024xf32> , %arg3: tensor<1024xf32> , %arg4: tensor<1024xf32> , %arg5: tensor<1024xf32> , %arg6: tensor<1024x464x1x1xf32> , %arg7: tensor<232xf32> , %arg8: tensor<232xf32> , %arg9: tensor<232xf32> , %arg10: tensor<232xf32> , %arg11: tensor<232x232x1x1xf32> , %arg12: tensor<232xf32> , %arg13: tensor<232xf32> , %arg14: tensor<232xf32> , %arg15: tensor<232xf32> , %arg16: tensor<232x1x3x3xf32> , %arg17: tensor<232xf32> , %arg18: tensor<232xf32> , %arg19: tensor<232xf32> , %arg20: tensor<232xf32> , %arg21: tensor<232x232x1x1xf32> , %arg22: tensor<232xf32> , %arg23: tensor<232xf32> , %arg24: tensor<232xf32> , %arg25: tensor<232xf32
#loc5 = "<XLA_MLIR_DEBUGINFO_BEGIN>torchvision.models.shufflenetv2.ShuffleNetV2/torch.nn.modules.pooling.MaxPool2d_maxpool;<XLA_MLIR_DEBUGINFO_END>aten__max_pool2d"
module @IrToHlo.1937 attributes {mhlo.cross_program_prefetches = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false} {
func.func @main(%arg0: tensor<1000xf32> , %arg1: tensor<1000x1024xf32> , %arg2: tensor<1024xf32> , %arg3: tensor<1024xf32> , %arg4: tensor<1024xf32> , %arg5: tensor<1024xf32> , %arg6: tensor<1024x464x1x1xf32> , %arg7: tensor<232xf32> , %arg8: tensor<232xf32> , %arg9: tensor<232xf32> , %arg10: tensor<232xf32> , %arg11: tensor<232x232x1x1xf32> , %arg12: tensor<232xf32> , %arg13: tensor<232xf32> , %arg14: tensor<232xf32> , %arg15: tensor<232xf32> , %arg16: tensor<232x1x3x3xf32> , %arg17: tensor<232xf32> , %arg18: tensor<232xf32> , %arg19: tensor<232xf32> , %arg20: tensor<232xf32> , %arg21: tensor<232x232x1x1xf32> , %arg22: tensor<232xf32> , %arg23: tensor<232xf32> , %arg24: tensor<232xf32> , %arg25: tensor<232xf32
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "skylake-avx512", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,-avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,+rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,-avx512bf16,-avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,-pku,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,-sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>) {
hal.executable.export public @tensor_pack_dispatch_0_pack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], translation_info = #iree_codegen.translation_info<VMVXDefault>} {
^bb0(%arg0: !hal.device):
%c1 = arith.constant 1 : index
hal.return %c1, %c1, %c1 : index, index, index
}
builtin.module {
func.func @tensor_pack_dispatch_0_pack_f32() {
%c256 = arith.constant 256 : index
// -----// IR Dump After mlir::iree_compiler::IREE::VM::OrdinalAllocationPass (iree-vm-ordinal-allocation) //----- //
vm.module public @module attributes {ordinal_counts = #vm.ordinal_counts<import_funcs = 19, export_funcs = 2, internal_funcs = 2, global_bytes = 4, global_refs = 2, rodatas = 6, rwdatas = 0>} {
vm.global.i32 private mutable @_device_query_0 {ordinal = 0 : i32} : i32
vm.global.ref private mutable @_pipeline_layout_0 {ordinal = 0 : i32} : !vm.ref<!hal.pipeline_layout>
vm.global.ref private mutable @_executable_tensor_pack_dispatch_0 {ordinal = 1 : i32} : !vm.ref<!hal.executable>
vm.rodata private @tensor_pack_dispatch_0_vmvx_bytecode_fb {alignment = 16 : i64, mime_type = "application/x-flatbuffers", ordinal = 0 : i32} dense<"0x504B03042D0000000000000021007860132CFFFFFFFFFFFFFFFF090019006D6F64756C652E6662CAFE01000001001000FC07000000000000FC07000000000000FC070000080000004952454568F8FFFF2C0000000100000024030000380300003C03000040030000440300004C04000000000F005C04000010000000060000006D6F64756
[TRACY ] Capture Name: iree-run-module @ 2024-04-03 20:33:10
[TRACY ] Cpu Arch: x86_64
[TRACY ]
[TRACY-CPU] CPU Threads: 29
[TRACY-CPU] CPU Zones: 1695
[TRACY-CPU] Zone Stats: 2
[TRACY-CPU] Zone Count Total Main thread iree-poller iree-worker-0 iree-worker-1 iree-worker-10 iree-worker-11 iree-worker-12 iree-worker-13 iree-worker-14 iree-worker-15 iree-worker-16 iree-worker-17 iree-worker-18 iree-worker-19 iree-worker-2 iree-worker-20 iree-worker-21 iree-worker-22 iree-worker-23 iree-worker-3 iree-worker-4 iree-worker-5 iree-worker-6 iree-worker-7 iree-worker-8 iree-worker-9
[TRACY-CPU] Duration 327.9ms(100%) 72.4844ms(100%) 18.6893ms(100%) 17.7719ms(100%) 17.1578ms(100%) 11.073ms(100%) 7.05754ms(100%) 7.04673ms(100%) 7.65951ms(100%) 7.24205ms(100%) 7.59424ms(100%) 7.02599ms(100%) 6.95218ms(100%) 6.14334ms(100%) 5.46324ms(100%) 15.8939ms(100%) 4.9626ms(100%) 4.45126ms(100%) 3
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>
#device_target_vmvx = #hal.device.target<"vmvx", [#executable_target_vmvx_bytecode_fb]>
module attributes {hal.device.targets = [#device_target_vmvx]} {
func.func @tensor_pack(%arg0: tensor<127x256xf32>, %arg1: tensor<16x32x8x8xf32>) -> tensor<16x32x8x8xf32> {
%cst = arith.constant 0.000000e+00 : f32
%pack = tensor.pack %arg0 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %arg1 : tensor<127x256xf32> -> tensor<16x32x8x8xf32>
return %pack : tensor<16x32x8x8xf32>
}
}
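// The pack above works out as: dim 0 splits 127 into 16 outer tiles of 8 rows
// (16 * 8 = 128, so padding_value fills the one padded row), and dim 1 splits
// 256 evenly into 32 outer tiles of 8 columns. A hand-written sketch of the
// equivalent pad + expand_shape + transpose sequence follows; the op spellings
// (including the output_shape form of tensor.expand_shape) assume a recent
// upstream MLIR, and this is not claimed to be the exact IR any IREE pass emits.
func.func @tensor_pack_decomposed(%arg0: tensor<127x256xf32>) -> tensor<16x32x8x8xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // Pad one trailing row so both dims divide evenly by the 8x8 inner tiles.
  %padded = tensor.pad %arg0 low[0, 0] high[1, 0] {
  ^bb0(%i: index, %j: index):
    tensor.yield %cst : f32
  } : tensor<127x256xf32> to tensor<128x256xf32>
  // Split each dim into (outer, inner) tile factors: 128 -> 16x8, 256 -> 32x8.
  %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [16, 8, 32, 8] : tensor<128x256xf32> into tensor<16x8x32x8xf32>
  // Interleave so both inner tile dims land at the back: (16, 8, 32, 8) -> (16, 32, 8, 8).
  %init = tensor.empty() : tensor<16x32x8x8xf32>
  %packed = linalg.transpose ins(%expanded : tensor<16x8x32x8xf32>) outs(%init : tensor<16x32x8x8xf32>) permutation = [0, 2, 1, 3]
  return %packed : tensor<16x32x8x8xf32>
}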
// Only handles nchw*fchw -> nfhw directly.
// For nhwc*fhwc -> nhwf, first permute into that form:
// permutation_input = [0,3,1,2]
// permutation_output = [0,3,1,2]
// permutation_window = [0,2,3,1]
// If feature_group_count != 1 or batch_group_count != 1, the reshaped
// %input, %window, and %output must be given with expanded dims:
// %input and %window are rank 5 and %output is rank 7.
// (A concrete nchw*fchw example follows the maps below.)
#map0 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d4, d5, d6)>
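// A minimal stablehlo.convolution already in the supported nchw*fchw -> nfhw
// form, for reference (shapes here are illustrative; feature_group_count and
// batch_group_count are 1, so no expanded-dims reshape is needed):
func.func @conv_nchw_fchw(%input: tensor<1x4x16x16xf32>, %filter: tensor<8x4x3x3xf32>) -> tensor<1x8x14x14xf32> {
  // [b, f, 0, 1] x [o, i, 0, 1] -> [b, f, 0, 1] spells nchw * fchw -> nfhw.
  %0 = stablehlo.convolution(%input, %filter)
         dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1],
         window = {stride = [1, 1], pad = [[0, 0], [0, 0]]}
         {batch_group_count = 1 : i64, feature_group_count = 1 : i64}
       : (tensor<1x4x16x16xf32>, tensor<8x4x3x3xf32>) -> tensor<1x8x14x14xf32>
  return %0 : tensor<1x8x14x14xf32>
}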
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
#device_target_vmvx = #hal.device.target<"vmvx", [#executable_target_vmvx_bytecode_fb]>
module attributes {hal.device.targets = [#device_target_vmvx]} {
func.func @abs_4d_f16(%arg0: tensor<5x5x5x5xf16>) -> tensor<5x5x5x5xf16> {
%0 = stablehlo.abs %arg0 : tensor<5x5x5x5xf16>
return %0 : tensor<5x5x5x5xf16>
}
}
func.func @abs_4d_f16(%a : tensor<5x5x5x5xf16>) -> tensor<5x5x5x5xf16> {
%r = stablehlo.abs %a : tensor<5x5x5x5xf16>
return %r : tensor<5x5x5x5xf16>
}
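// For reference, a hand-written sketch of roughly what the StableHLO-to-linalg
// path turns the abs above into: an elementwise linalg.generic over identity
// maps applying math.absf (the actual pass output may differ in details).
#map_id4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
func.func @abs_4d_f16_linalg(%arg0: tensor<5x5x5x5xf16>) -> tensor<5x5x5x5xf16> {
  %init = tensor.empty() : tensor<5x5x5x5xf16>
  %0 = linalg.generic {indexing_maps = [#map_id4, #map_id4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
      ins(%arg0 : tensor<5x5x5x5xf16>) outs(%init : tensor<5x5x5x5xf16>) {
  ^bb0(%in: f16, %out: f16):
    %abs = math.absf %in : f16
    linalg.yield %abs : f16
  } -> tensor<5x5x5x5xf16>
  return %0 : tensor<5x5x5x5xf16>
}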