@Max191 · Created May 21, 2024 15:53
Bad pure unpack codegen compared to unpack + transpose

The first module below is the pure unpack dispatch (pure_pack.mlir); the second is the equivalent unpack + transpose dispatch (unpack_transpose.mlir).
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
hal.executable private @main$async_dispatch_330 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_330_unpack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main$async_dispatch_330_unpack_f32() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = arith.index_castui %0 : i32 to index
%3 = arith.index_castui %1 : i32 to index
%4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0], sizes = [64, 1828, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>> -> tensor<64x1828x8x16x16xf32>
%7 = tensor.empty() : tensor<29241x128x64xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 1]]>} : tensor<64x1828x8x16x16xf32> -> tensor<29241x128x64xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0, 0], sizes = [29241, 128, 64], strides = [1, 1, 1] : tensor<29241x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
return
}
}
}
}
util.global private mutable @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
util.initializer {
%c9583067136 = arith.constant 9583067136 : index
%c-1_i64 = arith.constant -1 : i64
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c9583067136}
util.global.store %buffer, @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
util.return
}
util.func public @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1 = arith.constant 1 : index
%c4791533568 = arith.constant 4791533568 : index
%c-1783627776_i32 = arith.constant -1783627776 : i32
%c1552941056_i32 = arith.constant 1552941056 : i32
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device_0 = hal.devices.get %c0 : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c1552941056_i32, %c-1783627776_i32]) : i32, i32
%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer = util.global.load @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer)[%c0, %c4791533568],
%c1 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_f32_buffer : !hal.buffer)[%c4791533568, %c4791533568]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_f32) : index, index, index
%exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main$async_dispatch_330) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_f32) : index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
util.return
}
}
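
unpack_transpose.mlir, the unpack + transpose dispatch:
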
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
hal.executable private @main$async_dispatch_330 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>) {
hal.executable.export public @main$async_dispatch_330_unpack_transpose_29241x128x64_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main$async_dispatch_330_unpack_transpose_29241x128x64_f32() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = arith.index_castui %0 : i32 to index
%3 = arith.index_castui %1 : i32 to index
%4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>>
%5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0], sizes = [64, 1828, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1828x8x16x16xf32>> -> tensor<64x1828x8x16x16xf32>
%7 = tensor.empty() : tensor<29241x128x64xf32>
%8 = tensor.empty() : tensor<64x29241x128xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %8 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<64x1828x8x16x16xf32> -> tensor<64x29241x128xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack : tensor<64x29241x128xf32>) outs(%7 : tensor<29241x128x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [16, 16, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<29241x128x64xf32>
flow.dispatch.tensor.store %9, %5, offsets = [0, 0, 0], sizes = [29241, 128, 64], strides = [1, 1, 1] : tensor<29241x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<29241x128x64xf32>>
return
}
}
}
}
util.global private mutable @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
util.initializer {
%c9583067136 = arith.constant 9583067136 : index
%c-1_i64 = arith.constant -1 : i64
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%allocator = hal.device.allocator<%device_0 : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> affinity(%c-1_i64) type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c9583067136}
util.global.store %buffer, @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
util.return
}
util.func public @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c1 = arith.constant 1 : index
%c4791533568 = arith.constant 4791533568 : index
%c-1783627776_i32 = arith.constant -1783627776 : i32
%c1552941056_i32 = arith.constant 1552941056 : i32
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device_0 = hal.devices.get %c0 : !hal.device
%cmd = hal.command_buffer.create device(%device_0 : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device_0 : !hal.device) layout(<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c1552941056_i32, %c-1783627776_i32]) : i32, i32
%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer = util.global.load @main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer)[%c0, %c4791533568],
%c1 = (%main$async_dispatch_330_embedded_elf_x86_64_main$async_dispatch_330_unpack_transpose_29241x128x64_f32_buffer : !hal.buffer)[%c4791533568, %c4791533568]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device_0 : !hal.device) target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_transpose_29241x128x64_f32) : index, index, index
%exe = hal.executable.lookup device(%device_0 : !hal.device) executable(@main$async_dispatch_330) : !hal.executable
%ordinal = hal.executable.export.ordinal target(@main$async_dispatch_330::@embedded_elf_x86_64::@main$async_dispatch_330_unpack_transpose_29241x128x64_f32) : index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[%ordinal] workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
hal.device.queue.execute<%device_0 : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
util.return
}
}
Max191 (author) commented on May 21, 2024:

The unpack_transpose.mlir version results in a faster benchmark, so there is likely some inefficient codegen in the pure_pack.mlir case. One thing to note is that the vector sizes for the unpack + transpose are [16, 16, 16], while the pure unpack vector sizes are [16, 16, 1]. A vector size of 1 on the innermost dimension of the 29241x128x64 result means the contiguous output dimension is not vectorized, so the inner vector lanes step through the 128-sized dimension at a stride of 64 f32s instead of writing unit-stride; this may be the root of the inefficiency.
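
For reference, stripped of the HAL boilerplate, the two dispatches reduce to the following pair of semantically equivalent forms (a minimal sketch; @unpack_forms and the SSA names are hypothetical, and linalg.transpose is used here as shorthand for the linalg.generic transpose in the second listing):

func.func @unpack_forms(%src: tensor<64x1828x8x16x16xf32>) -> (tensor<29241x128x64xf32>, tensor<29241x128x64xf32>) {
  %dest = tensor.empty() : tensor<29241x128x64xf32>
  // Form 1 (pure_pack.mlir): the transpose is folded into the unpack through
  // outer_dims_perm, leaving the innermost result dimension (64) untiled.
  %pure = tensor.unpack %src outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<64x1828x8x16x16xf32> -> tensor<29241x128x64xf32>
  // Form 2 (unpack_transpose.mlir): identity-permutation unpack into a
  // temporary, followed by an explicit transpose into the final layout.
  %tmp = tensor.empty() : tensor<64x29241x128xf32>
  %unpacked = tensor.unpack %src outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %tmp : tensor<64x1828x8x16x16xf32> -> tensor<64x29241x128xf32>
  %transposed = linalg.transpose ins(%unpacked : tensor<64x29241x128xf32>) outs(%dest : tensor<29241x128x64xf32>) permutation = [1, 2, 0]
  return %pure, %transposed : tensor<29241x128x64xf32>, tensor<29241x128x64xf32>
}

With form 1's [16, 16, 1] config, a vectorized store covers a 16x16x1 slice of the 29241x128x64 result, so its lanes are strided in memory; with form 2's [16, 16, 16] config, 16 lanes lie along the contiguous 64-sized dimension, filling one 512-bit vector on znver4 (matching native_vector_size = 64).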
