
@bjacob
Last active February 16, 2024 20:30
Attempt at ukernel fallback to codegen

This documents a short-lived attempt at solving #15784 by implementing the idea laid out in the original issue description. It changes the mmt4d ukernel to return a second value, a status code, and changes the mmt4d-to-ukernel lowering to wrap the result in an scf.if based on that status code:

%62:2 = iree_codegen.ukernel.generic "iree_uk_mmt4d" ins(%59, %60 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%61 : tensor<1x1x16x16xf32>) (%c1, %c1, %dim, %c16_i32, %c16_i32, %c1_i32, %c1281_i32 : index, index, index, i32, i32, i32, i32) fn_def_attrs {hal.import.bitcode = true, hal.import.cconv = 1 : i32, hal.import.fields = ["processor_data"]} strided_outer_dims(1) -> tensor<1x1x16x16xf32>, i32
%63 = arith.cmpi eq, %62#1, %c0_i32 : i32
%64 = scf.if %63 -> (tensor<1x1x16x16xf32>) {
  scf.yield %62#0 : tensor<1x1x16x16xf32>
} else {
  %65 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 16, 16, 0], [0, 0, 1, 0, 0, 1]]>, "no-ukernel fallback"} ins(%59, %60 : tensor<1x?x16x1xf32>, tensor<1x?x16x1xf32>) outs(%61 : tensor<1x1x16x16xf32>) -> tensor<1x1x16x16xf32>
  scf.yield %65 : tensor<1x1x16x16xf32>
}

The code diff is attached below (patch.diff).

My testcase is also attached below (matmul.mlir).

My reproduction command line:

tools/iree-compile ~/matmul.mlir -o /tmp/matmul.vmfb \
  --iree-hal-target-backends=llvm-cpu \
  --iree-llvmcpu-target-cpu=znver4 \
  --iree-llvmcpu-enable-ukernels=all \
  --mlir-print-op-on-diagnostic=false \
  --mlir-disable-threading \
  --mlir-print-ir-after-all

The immediate problem (probably solvable)

Right now this fails in bufferization, with an error complaining that the old ukernel-on-tensors op being replaced still has uses. Indeed it does: its second result feeds the scf.if condition.

See the attached IR-after-all log with custom prints (log.txt).

The deeper problem (probably not currently solvable)

Mahesh let me know that even if I overcome this bufferization issue, the next problem will be vectorization. The mmt4d here operates on a 16x16 tile, already at a deeply nested level in the overall loop structure. When that 16x16 tile gets vectorized, the scf.if introduced here will interfere with the way vectorization expects vector-tensor transfers to be transformed.

log.txt (truncated):
// -----// IR Dump After AssignTargetDevicesPass (iree-hal-assign-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After AutoInputConversionPipeline (iree-auto-input-conversion) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After IREEImportPublic (iree-import-public) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After ImportMLProgram (iree-import-ml-program) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After SanitizeModuleNames (iree-sanitize-module-names) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After ConvertMeshToFlow (iree-convert-mesh-to-flow) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = call @_matmul_dynamic(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%c0 = arith.constant 0 : index
%dim = tensor.dim %9, %c0 : tensor<?x?xf32>
%c1 = arith.constant 1 : index
%dim_0 = tensor.dim %9, %c1 : tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%dim, %dim_0} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
func.func private @_matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func private @_matmul_dynamic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = call @_matmul_dynamic(%2, %5, %8) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%dim = tensor.dim %9, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %9, %c1 : tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%dim, %dim_0} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
}
// -----// IR Dump After DemoteF64ToF32 (iree-util-demote-f64-to-f32) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
}
// -----// IR Dump After RemoveZeroExtentTensors (iree-global-opt-remove-zero-extent-tensors) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOps (iree-global-opt-detach-elementwise-from-named-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversion (linalg-named-op-conversion) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmul (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After EraseUnusedLinalgOperands (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
}
// -----// IR Dump After ExpandTensorShapes (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertElementwiseToLinalg (convert-elementwise-to-linalg) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcat (iree-global-opt-decompose-concat) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After FoldUnitExtentDims (iree-flow-fold-unit-extent-dims) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After FuseDequantizationMatmul (iree-global-opt-fuse-dequantization-matmul) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = linalg.matmul ins(%2, %5 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%10 = hal.tensor.export %9 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %10 : !hal.buffer_view
}
// -----// IR Dump After SetEncoding (iree-global-opt-set-encoding) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
%dim = tensor.dim %2, %c0 : tensor<?x?xf32>
%10 = affine.apply #map3()[%9#0, %dim]
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32>
%11 = affine.apply #map3()[%9#1, %dim_0]
%padded = tensor.pad %2 low[0, 0] high[%10, %11] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%12 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%13:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32>
%14 = affine.apply #map3()[%13#0, %dim_1]
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32>
%15 = affine.apply #map3()[%13#1, %dim_2]
%padded_3 = tensor.pad %5 low[0, 0] high[%14, %15] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%16 = iree_linalg_ext.set_encoding %padded_3 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%17:2 = iree_linalg_ext.upper_bound_tile_size tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32>
%18 = affine.apply #map3()[%17#0, %dim_4]
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32>
%19 = affine.apply #map3()[%17#1, %dim_5]
%padded_6 = tensor.pad %8 low[0, 0] high[%18, %19] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%20 = iree_linalg_ext.set_encoding %padded_6 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%21 = linalg.matmul ins(%12, %16 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) outs(%20 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>>
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32>
%22 = iree_linalg_ext.unset_encoding %21 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %22[0, 0] [%dim_7, %dim_8] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%23 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %23 : !hal.buffer_view
}
}
// -----// IR Dump After CPUMaterializeUpperBoundTileSize (iree-codegen-cpu-materialize-upper-bound-tile-size) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%dim = tensor.dim %2, %c0 : tensor<?x?xf32>
%9 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim]
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32>
%10 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c1, %dim_0]
%padded = tensor.pad %2 low[0, 0] high[%9, %10] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%11 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32>
%12 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c1, %dim_1]
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32>
%13 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim_2]
%padded_3 = tensor.pad %5 low[0, 0] high[%12, %13] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%14 = iree_linalg_ext.set_encoding %padded_3 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32>
%15 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim_4]
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32>
%16 = affine.apply affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>()[%c16, %dim_5]
%padded_6 = tensor.pad %8 low[0, 0] high[%15, %16] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<?x?xf32>
%17 = iree_linalg_ext.set_encoding %padded_6 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%18 = linalg.matmul ins(%11, %14 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%17 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32>
%19 = iree_linalg_ext.unset_encoding %18 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32>
%extracted_slice = tensor.extract_slice %19[0, 0] [%dim_7, %dim_8] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%20 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %20 : !hal.buffer_view
}
// -----// IR Dump After CPUMaterializeEncoding (iree-codegen-cpu-materialize-encoding) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%dim = tensor.dim %2, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32>
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim]
%10 = tensor.empty(%9, %dim_0) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32>
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim_2]
%12 = tensor.empty(%11, %dim_1) : tensor<?x?x16x1xf32>
%pack_3 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim_4]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%dim_5]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_6 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_3 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_6 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32>
%17 = tensor.empty(%dim_7, %dim_8) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After MaterializeHomogeneousEncodings (iree-global-opt-materialize-homogeneous-encodings) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%dim = tensor.dim %2, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xf32>
%9 = affine.apply #map()[%dim]
%10 = tensor.empty(%9, %dim_0) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xf32>
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xf32>
%11 = affine.apply #map()[%dim_2]
%12 = tensor.empty(%11, %dim_1) : tensor<?x?x16x1xf32>
%pack_3 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%dim_4 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_5 = tensor.dim %8, %c1 : tensor<?x?xf32>
%13 = affine.apply #map()[%dim_4]
%14 = affine.apply #map()[%dim_5]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_6 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_3 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_6 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%dim_7 = tensor.dim %8, %c0 : tensor<?x?xf32>
%dim_8 = tensor.dim %8, %c1 : tensor<?x?xf32>
%17 = tensor.empty(%dim_7, %dim_8) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After SimplifyPackUnpack (iree-global-opt-simplify-pack-unpack) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After GeneralizeLinalgNamedOps (iree-global-opt-generalize-linalg-named-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After CSE (cse) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After JitGlobals (iree-consteval-jit-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.fixedpoint.iteration = 0 : index} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After RaiseSpecialOps (iree-global-opt-raise-special-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After VerifyInputLegality (iree-verify-input-legality) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After TensorPadToTensorInsertSlice (iree-flow-tensor-pad-to-tensor-insert-slice) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply #map()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After FusionOfTensorOps (iree-flow-fusion-of-tensor-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After SplitReduction (iree-flow-split-reduction-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After InterchangeGenericOps (iree-flow-interchange-generic-ops) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After FormScalarDispatches (iree-flow-form-scalar-dispatches) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = tensor.empty(%11, %3) : tensor<?x?x16x1xf32>
%pack_0 = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %12 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = tensor.empty(%13, %14) : tensor<?x?x16x16xf32>
%pack_1 = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %15 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
%16 = linalg.mmt4d ins(%pack, %pack_0 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%pack_1 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
%17 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
%18 = hal.tensor.export %unpack "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchRegions (iree-flow-form-dispatch-regions) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%11 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%9, %1}) {
%pack = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %10 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.return %pack : tensor<?x?x16x1xf32>
}
%12 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%13 = tensor.empty(%12, %3) : tensor<?x?x16x1xf32>
%c0_0 = arith.constant 0 : index
%c1_1 = arith.constant 1 : index
%14 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%12, %3}) {
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %13 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.return %pack : tensor<?x?x16x1xf32>
}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%17 = tensor.empty(%15, %16) : tensor<?x?x16x16xf32>
%c0_2 = arith.constant 0 : index
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c1_5 = arith.constant 1 : index
%18 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) {
%pack = tensor.pack %8 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %17 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.return %pack : tensor<?x?x16x16xf32>
}
%19 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) {
%23 = linalg.mmt4d ins(%11, %14 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%18 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.return %23 : tensor<?x?x16x16xf32>
}
%20 = tensor.empty(%6, %7) : tensor<?x?xf32>
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%21 = flow.dispatch.region -> (tensor<?x?xf32>{%6, %7}) {
%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %20 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.return %unpack : tensor<?x?xf32>
}
%22 = hal.tensor.export %21 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %22 : !hal.buffer_view
}
// -----// IR Dump After CloneProducersIntoDispatchRegions (iree-flow-clone-producers-into-dispatch-regions) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%11 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%9, %1}) {
%cst_8 = arith.constant 0.000000e+00 : f32
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%24 = tensor.empty(%23, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.return %pack : tensor<?x?x16x1xf32>
}
%12 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%13 = tensor.empty(%12, %3) : tensor<?x?x16x1xf32>
%c0_0 = arith.constant 0 : index
%c1_1 = arith.constant 1 : index
%14 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%12, %3}) {
%cst_8 = arith.constant 0.000000e+00 : f32
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%24 = tensor.empty(%23, %3) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst_8 : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.return %pack : tensor<?x?x16x1xf32>
}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%17 = tensor.empty(%15, %16) : tensor<?x?x16x16xf32>
%c0_2 = arith.constant 0 : index
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c1_5 = arith.constant 1 : index
%18 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) {
%cst_8 = arith.constant 0.000000e+00 : f32
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%25 = tensor.empty(%24, %23) : tensor<?x?x16x16xf32>
%pack = tensor.pack %8 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %25 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.return %pack : tensor<?x?x16x16xf32>
}
%19 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) {
%23 = linalg.mmt4d ins(%11, %14 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%18 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.return %23 : tensor<?x?x16x16xf32>
}
%20 = tensor.empty(%6, %7) : tensor<?x?xf32>
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%21 = flow.dispatch.region -> (tensor<?x?xf32>{%6, %7}) {
%23 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %23 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.return %unpack : tensor<?x?xf32>
}
%22 = hal.tensor.export %21 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %22 : !hal.buffer_view
}
// -----// IR Dump After CollapseDimensions (iree-flow-collapse-dimensions) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = tensor.empty(%9, %1) : tensor<?x?x16x1xf32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%11 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%9, %1}) {
%cst_8 = arith.constant 0.000000e+00 : f32
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%24 = tensor.empty(%23, %1) : tensor<?x?x16x1xf32>
%pack = tensor.pack %2 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.return %pack : tensor<?x?x16x1xf32>
}
%12 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%13 = tensor.empty(%12, %3) : tensor<?x?x16x1xf32>
%c0_0 = arith.constant 0 : index
%c1_1 = arith.constant 1 : index
%14 = flow.dispatch.region -> (tensor<?x?x16x1xf32>{%12, %3}) {
%cst_8 = arith.constant 0.000000e+00 : f32
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%24 = tensor.empty(%23, %3) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst_8 : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.return %pack : tensor<?x?x16x1xf32>
}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%17 = tensor.empty(%15, %16) : tensor<?x?x16x16xf32>
%c0_2 = arith.constant 0 : index
%c1_3 = arith.constant 1 : index
%c0_4 = arith.constant 0 : index
%c1_5 = arith.constant 1 : index
%18 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) {
%cst_8 = arith.constant 0.000000e+00 : f32
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%25 = tensor.empty(%24, %23) : tensor<?x?x16x16xf32>
%pack = tensor.pack %8 padding_value(%cst_8 : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %25 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.return %pack : tensor<?x?x16x16xf32>
}
%19 = flow.dispatch.region -> (tensor<?x?x16x16xf32>{%15, %16}) {
%23 = linalg.mmt4d ins(%11, %14 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%18 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.return %23 : tensor<?x?x16x16xf32>
}
%20 = tensor.empty(%6, %7) : tensor<?x?xf32>
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%21 = flow.dispatch.region -> (tensor<?x?xf32>{%6, %7}) {
%23 = tensor.empty(%6, %7) : tensor<?x?xf32>
%unpack = tensor.unpack %19 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %23 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.return %unpack : tensor<?x?xf32>
}
%22 = hal.tensor.export %21 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %22 : !hal.buffer_view
}
// -----// IR Dump After FormDispatchWorkgroups (iree-flow-form-dispatch-workgroups) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%cst = arith.constant 0.000000e+00 : f32
%22 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32>
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%24 = tensor.empty(%23, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %22 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %arg7, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%cst = arith.constant 0.000000e+00 : f32
%22 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32>
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%24 = tensor.empty(%23, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %22 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %24 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %arg7, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%cst = arith.constant 0.000000e+00 : f32
%23 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32>
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20]
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %24) : tensor<?x?x16x16xf32>
%pack = tensor.pack %23 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %arg8, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index
%25 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32>
%26 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32>
%27 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32>
%28 = linalg.mmt4d ins(%25, %26 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%27 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %28, %arg5, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32>
%24 = tensor.empty(%21, %22) : tensor<?x?xf32>
%unpack = tensor.unpack %23 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %24 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %arg8, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CaptureDispatchDynamicDims (iree-flow-capture-dispatch-dynamic-dims) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg4, %arg6}
%20 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%arg5, %arg6}
%21 = flow.dispatch.workload.ordinal %arg4, 0 : index
%22 = flow.dispatch.workload.ordinal %arg5, 1 : index
%23 = flow.dispatch.workload.ordinal %arg6, 2 : index
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.tensor.load %19, offsets = [0, 0], sizes = [%21, %23], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %23} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21]
%26 = tensor.empty(%25, %23) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %20, offsets = [0, 0, 0, 0], sizes = [%22, %23, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%22, %23}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg6, %arg4}
%20 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%arg5, %arg6}
%21 = flow.dispatch.workload.ordinal %arg4, 0 : index
%22 = flow.dispatch.workload.ordinal %arg5, 1 : index
%23 = flow.dispatch.workload.ordinal %arg6, 2 : index
%cst = arith.constant 0.000000e+00 : f32
%24 = flow.dispatch.tensor.load %19, offsets = [0, 0], sizes = [%23, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%23, %21} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21]
%26 = tensor.empty(%25, %23) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %20, offsets = [0, 0, 0, 0], sizes = [%22, %23, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%22, %23}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg4, %arg5}
%20 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%arg6, %arg7}
%21 = flow.dispatch.workload.ordinal %arg4, 0 : index
%22 = flow.dispatch.workload.ordinal %arg5, 1 : index
%23 = flow.dispatch.workload.ordinal %arg6, 2 : index
%24 = flow.dispatch.workload.ordinal %arg7, 3 : index
%cst = arith.constant 0.000000e+00 : f32
%25 = flow.dispatch.tensor.load %19, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %22} -> tensor<?x?xf32>
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%22]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%21]
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32>
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %20, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%arg6, %arg7}
%20 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%arg8, %arg9}
%21 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%arg10, %arg11}
%22 = flow.dispatch.workload.ordinal %arg6, 0 : index
%23 = flow.dispatch.workload.ordinal %arg7, 1 : index
%24 = flow.dispatch.workload.ordinal %arg8, 2 : index
%25 = flow.dispatch.workload.ordinal %arg9, 3 : index
%26 = flow.dispatch.workload.ordinal %arg10, 4 : index
%27 = flow.dispatch.workload.ordinal %arg11, 5 : index
%28 = flow.dispatch.tensor.load %19, offsets = [0, 0, 0, 0], sizes = [%22, %23, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%22, %23} -> tensor<?x?x16x1xf32>
%29 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0, 0], sizes = [%24, %25, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%24, %25} -> tensor<?x?x16x1xf32>
%30 = flow.dispatch.tensor.load %21, offsets = [0, 0, 0, 0], sizes = [%26, %27, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%26, %27} -> tensor<?x?x16x16xf32>
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %31, %21, offsets = [0, 0, 0, 0], sizes = [%26, %27, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%26, %27}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%19 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%arg4, %arg5}
%20 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%arg6, %arg7}
%21 = flow.dispatch.workload.ordinal %arg4, 0 : index
%22 = flow.dispatch.workload.ordinal %arg5, 1 : index
%23 = flow.dispatch.workload.ordinal %arg6, 2 : index
%24 = flow.dispatch.workload.ordinal %arg7, 3 : index
%25 = flow.dispatch.tensor.load %19, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%21, %22} -> tensor<?x?x16x16xf32>
%26 = tensor.empty(%23, %24) : tensor<?x?xf32>
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %20, offsets = [0, 0], sizes = [%23, %24], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32>
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32>
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20}
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22}
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32>
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32>
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32>
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32>
%26 = tensor.empty(%21, %22) : tensor<?x?xf32>
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32>
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32>
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20}
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22}
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32>
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32>
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32>
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32>
%26 = tensor.empty(%21, %22) : tensor<?x?xf32>
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After InitializeEmptyTensors (iree-flow-initialize-empty-tensors) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32>
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32>
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%20]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%19]
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32>
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20}
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22}
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32>
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32>
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32>
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32>
%26 = tensor.empty(%21, %22) : tensor<?x?xf32>
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After OutlineDispatchExterns (iree-flow-outline-dispatch-externs) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch.workgroups[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%19, %21], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %21} -> tensor<?x?xf32>
%25 = affine.apply #map()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch.workgroups[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19}
%23 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
%24 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [%21, %19], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%21, %19} -> tensor<?x?xf32>
%25 = affine.apply #map()[%19]
%26 = tensor.empty(%25, %21) : tensor<?x?x16x1xf32>
%pack = tensor.pack %24 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %26 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %23, offsets = [0, 0, 0, 0], sizes = [%20, %21, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%20, %21}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch.workgroups[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0], sizes = [%19, %20], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%19, %20} -> tensor<?x?xf32>
%26 = affine.apply #map()[%20]
%27 = affine.apply #map()[%19]
%28 = tensor.empty(%27, %26) : tensor<?x?x16x16xf32>
%pack = tensor.pack %25 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %28 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %24, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%16 = flow.dispatch.workgroups[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) {
%19 = flow.dispatch.workload.ordinal %arg6, 0 : index
%20 = flow.dispatch.workload.ordinal %arg7, 1 : index
%21 = flow.dispatch.workload.ordinal %arg8, 2 : index
%22 = flow.dispatch.workload.ordinal %arg9, 3 : index
%23 = flow.dispatch.workload.ordinal %arg10, 4 : index
%24 = flow.dispatch.workload.ordinal %arg11, 5 : index
%25 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20}
%26 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22}
%27 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
%28 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%19, %20} -> tensor<?x?x16x1xf32>
%29 = flow.dispatch.tensor.load %26, offsets = [0, 0, 0, 0], sizes = [%21, %22, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%21, %22} -> tensor<?x?x16x1xf32>
%30 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24} -> tensor<?x?x16x16xf32>
%31 = linalg.mmt4d ins(%28, %29 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%30 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %31, %27, offsets = [0, 0, 0, 0], sizes = [%23, %24, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%23, %24}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8
flow.return %x, %y, %z : index, index, index
}
%17 = flow.dispatch.workgroups[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7} =
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%19 = flow.dispatch.workload.ordinal %arg4, 0 : index
%20 = flow.dispatch.workload.ordinal %arg5, 1 : index
%21 = flow.dispatch.workload.ordinal %arg6, 2 : index
%22 = flow.dispatch.workload.ordinal %arg7, 3 : index
%23 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20}
%24 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
%25 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0], sizes = [%19, %20, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%19, %20} -> tensor<?x?x16x16xf32>
%26 = tensor.empty(%21, %22) : tensor<?x?xf32>
%unpack = tensor.unpack %25 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %26 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %24, offsets = [0, 0], sizes = [%21, %22], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%21, %22}
flow.return
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6
flow.return %x, %y, %z : index, index, index
}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After OutlineDispatchRegions (iree-flow-outline-dispatch-regions) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After AnnotateDispatches (iree-flow-annotate-dispatches) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1]
%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
// -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After DeduplicateExecutables (iree-flow-deduplicate-executables) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1]
%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1]
%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After CSE (cse) //----- //
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After CleanupTensorShapes (iree-flow-cleanup-tensor-shapes) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
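// The linalg.mmt4d in dispatch_3 above is the op the ukernel-fallback
// experiment rewrites. As a sketch of its indexing semantics (with the tile
// sizes from this dispatch: m0 = 16, n0 = 16, k0 = 1):
//   out[m1][n1][m0][n0] += sum over (k1, k0) of
//     lhs[m1][k1][m0][k0] * rhs[n1][k1][n0][k0]
// i.e. a matmul over 16x1 LHS and RHS tiles accumulating into 16x16 output
// tiles, with the RHS tile indexed as if transposed (the "t" in mmt4d).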
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
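  // Dataflow summary of the dispatches invoked above (paraphrasing the IR,
  // not part of the dump):
  //   %10 = dispatch_0: pack LHS            ?x?        -> ?x?x16x1
  //   %12 = dispatch_1: transposed-pack RHS ?x?        -> ?x?x16x1
  //   %15 = dispatch_2: pack accumulator    ?x?        -> ?x?x16x16
  //   %16 = dispatch_3: mmt4d(%10, %12) accumulating in place into %15
  //   %17 = dispatch_4: unpack result       ?x?x16x16  -> ?x?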
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// -----// IR Dump After OutlineConstants (iree-util-outline-constants) //----- //
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%14 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
flow.executable private @matmul_dynamic_dispatch_0 {
flow.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_1 {
flow.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_2 {
flow.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_3 {
flow.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
flow.executable private @matmul_dynamic_dispatch_4 {
flow.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xf32>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xf32>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xf32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = flow.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %9, %1](%2, %0, %9, %1) : (tensor<?x?xf32>{%0, %1}, index, index, index) -> tensor<?x?x16x1xf32>{%9, %1}
%11 = affine.apply #map()[%4]
%12 = flow.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%4, %11, %3](%5, %4, %11, %3) : (tensor<?x?xf32>{%3, %4}, index, index, index) -> tensor<?x?x16x1xf32>{%11, %3}
%13 = affine.apply #map()[%6]
%14 = affine.apply #map()[%7]
%15 = flow.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%6, %7, %13, %14](%8, %6, %7, %13, %14) : (tensor<?x?xf32>{%6, %7}, index, index, index, index) -> tensor<?x?x16x16xf32>{%13, %14}
%16 = flow.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%9, %1, %11, %3, %13, %14](%10, %12, %15, %9, %1, %11, %3, %13, %14) : (tensor<?x?x16x1xf32>{%9, %1}, tensor<?x?x16x1xf32>{%11, %3}, tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index, index, index) -> %15{%13, %14}
%17 = flow.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%13, %14, %6, %7](%16, %13, %14, %6, %7) : (tensor<?x?x16x16xf32>{%13, %14}, index, index, index, index) -> tensor<?x?xf32>{%6, %7}
%18 = hal.tensor.export %17 "output0" : tensor<?x?xf32>{%6, %7} -> !hal.buffer_view
return %18 : !hal.buffer_view
}
}
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%element_type_f32_0 = hal.element_type<f32> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%element_type_f32_2 = hal.element_type<f32> : i32
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply #map()[%0]
%c0 = arith.constant 0 : index
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply #map()[%6]
%c0_4 = arith.constant 0 : index
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0_4 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%c0_5 = arith.constant 0 : index
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0_5 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%c0_6 = arith.constant 0 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0_6 to %16 for %16], %20[%c0_6 to %19 for %19], %24[%c0_6 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%c0_7 = arith.constant 0 : index
%26 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0_7 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%26}
%28 = stream.async.transfer %27 : !stream.resource<*>{%26} -> !stream.resource<external>{%26}
%29 = stream.tensor.export %28 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%26} -> !hal.buffer_view
return %29 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%element_type_f32_0 = hal.element_type<f32> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%element_type_f32_2 = hal.element_type<f32> : i32
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply #map()[%0]
%c0 = arith.constant 0 : index
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply #map()[%6]
%c0_4 = arith.constant 0 : index
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0_4 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%c0_5 = arith.constant 0 : index
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0_5 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%c0_6 = arith.constant 0 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0_6 to %16 for %16], %20[%c0_6 to %19 for %19], %24[%c0_6 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%c0_7 = arith.constant 0 : index
%26 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0_7 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%26}
%28 = stream.async.transfer %27 : !stream.resource<*>{%26} -> !stream.resource<external>{%26}
%29 = stream.tensor.export %28 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%26} -> !hal.buffer_view
return %29 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%element_type_f32_0 = hal.element_type<f32> : i32
%dense_row_major_1 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32_0) encoding(%dense_row_major_1)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%element_type_f32_2 = hal.element_type<f32> : i32
%dense_row_major_3 = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32_2) encoding(%dense_row_major_3)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%26}
%28 = stream.async.transfer %27 : !stream.resource<*>{%26} -> !stream.resource<external>{%26}
%29 = stream.tensor.export %28 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%26} -> !hal.buffer_view
return %29 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12}
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12}
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view
return %28 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12}
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12}
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view
return %28 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply #map()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply #map()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12}
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12}
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view
return %28 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply #map()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply #map()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12}
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12}
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view
return %28 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply #map()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply #map()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12}
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12}
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view
return %28 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = stream.tensor.sizeof tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} -> !stream.resource<*>{%2}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} -> !stream.resource<*>{%7}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = stream.tensor.sizeof tensor<?x?xf32>{%10, %11} : index
%13 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12}
%14 = stream.async.transfer %13 : !stream.resource<external>{%12} -> !stream.resource<*>{%12}
%15 = affine.apply #map()[%0]
%16 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%15, %1} : index
%17 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %2 for %2], %0, %15, %1) : (!stream.resource<*>{%2}, index, index, index) -> !stream.resource<*>{%16}
%18 = affine.apply #map()[%6]
%19 = stream.tensor.sizeof tensor<?x?x16x1xf32>{%18, %5} : index
%20 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%9[%c0 to %7 for %7], %6, %18, %5) : (!stream.resource<*>{%7}, index, index, index) -> !stream.resource<*>{%19}
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = stream.tensor.sizeof tensor<?x?x16x16xf32>{%21, %22} : index
%24 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%14[%c0 to %12 for %12], %10, %11, %21, %22) : (!stream.resource<*>{%12}, index, index, index, index) -> !stream.resource<*>{%23}
%25 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%17[%c0 to %16 for %16], %20[%c0 to %19 for %19], %24[%c0 to %23 for %23], %15, %1, %18, %5, %21, %22) : (!stream.resource<*>{%16}, !stream.resource<*>{%19}, !stream.resource<*>{%23}, index, index, index, index, index, index) -> %24{%23}
%26 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%25[%c0 to %23 for %23], %21, %22, %10, %11) : (!stream.resource<*>{%23}, index, index, index, index) -> !stream.resource<*>{%12}
%27 = stream.async.transfer %26 : !stream.resource<*>{%12} -> !stream.resource<external>{%12}
%28 = stream.tensor.export %27 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%12} -> !hal.buffer_view
return %28 : !hal.buffer_view
}
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%1]
%8 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
// -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
// -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply #map()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply #map()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply #map()[%12]
%27 = affine.apply #map()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply #map()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply #map()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply #map()[%12]
%27 = affine.apply #map()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply #map()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply #map()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply #map()[%12]
%27 = affine.apply #map()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply #map()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply #map()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply #map()[%12]
%27 = affine.apply #map()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply #map()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply #map()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply #map()[%12]
%27 = affine.apply #map()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply #map()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply #map()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply #map()[%12]
%27 = affine.apply #map()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3}
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%6, %7]) type(%element_type_f32) encoding(%dense_row_major)
%8 = arith.muli %6, %c4 : index
%9 = arith.muli %8, %7 : index
%10 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%6, %7} in !stream.resource<external>{%9}
%11 = stream.async.transfer %10 : !stream.resource<external>{%9} -> !stream.resource<*>{%9}
%12 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%13 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%12, %13]) type(%element_type_f32) encoding(%dense_row_major)
%14 = arith.muli %12, %c4 : index
%15 = arith.muli %14, %13 : index
%16 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15}
%17 = stream.async.transfer %16 : !stream.resource<external>{%15} -> !stream.resource<*>{%15}
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %1 : index
%21 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %18, %1](%5[%c0 to %3 for %3], %0, %18, %1) : (!stream.resource<*>{%3}, index, index, index) -> !stream.resource<*>{%20}
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%7]
%23 = arith.muli %22, %c64 : index
%24 = arith.muli %23, %6 : index
%25 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%7, %22, %6](%11[%c0 to %9 for %9], %7, %22, %6) : (!stream.resource<*>{%9}, index, index, index) -> !stream.resource<*>{%24}
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%12]
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%13]
%28 = arith.muli %26, %c1024 : index
%29 = arith.muli %28, %27 : index
%30 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%12, %13, %26, %27](%17[%c0 to %15 for %15], %12, %13, %26, %27) : (!stream.resource<*>{%15}, index, index, index, index) -> !stream.resource<*>{%29}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%18, %1, %22, %6, %26, %27](%21[%c0 to %20 for %20], %25[%c0 to %24 for %24], %30[%c0 to %29 for %29], %18, %1, %22, %6, %26, %27) : (!stream.resource<*>{%20}, !stream.resource<*>{%24}, !stream.resource<*>{%29}, index, index, index, index, index, index) -> %30{%29}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%26, %27, %12, %13](%31[%c0 to %29 for %29], %26, %27, %12, %13) : (!stream.resource<*>{%29}, index, index, index, index) -> !stream.resource<*>{%15}
%33 = stream.async.transfer %32 : !stream.resource<*>{%15} -> !stream.resource<external>{%15}
%34 = stream.tensor.export %33 : tensor<?x?xf32>{%12, %13} in !stream.resource<external>{%15} -> !hal.buffer_view
return %34 : !hal.buffer_view
}
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply #map()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply #map()[%10]
%24 = affine.apply #map()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply #map()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply #map()[%10]
%24 = affine.apply #map()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply #map()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply #map()[%10]
%24 = affine.apply #map()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply #map()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply #map()[%10]
%24 = affine.apply #map()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply #map()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply #map()[%10]
%24 = affine.apply #map()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%4[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%19 = affine.apply #map()[%6]
%20 = arith.muli %19, %c64 : index
%21 = arith.muli %20, %5 : index
%22 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %19, %5](%9[%c0 to %8 for %8], %6, %19, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%21}
%23 = affine.apply #map()[%10]
%24 = affine.apply #map()[%11]
%25 = arith.muli %23, %c1024 : index
%26 = arith.muli %25, %24 : index
%27 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %23, %24](%14[%c0 to %13 for %13], %10, %11, %23, %24) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%26}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %19, %5, %23, %24](%18[%c0 to %17 for %17], %22[%c0 to %21 for %21], %27[%c0 to %26 for %26], %15, %1, %19, %5, %23, %24) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%21}, !stream.resource<transient>{%26}, index, index, index, index, index, index) -> %27{%26}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%23, %24, %10, %11](%28[%c0 to %26 for %26], %23, %24, %10, %11) : (!stream.resource<transient>{%26}, index, index, index, index) -> !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg3[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg4[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg5[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
%30 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27[%c0 to %17 for %17], %28[%c0 to %20 for %20], %29[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %29{%24}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%30[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %31 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
// -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply #map()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%25 = stream.timepoint.immediate => !stream.timepoint
%26 = stream.timepoint.immediate => !stream.timepoint
%27 = stream.timepoint.immediate => !stream.timepoint
%28 = stream.timepoint.join max(%25, %26, %27) => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%28) => with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%31:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%34 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%35 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%36 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %34, %35, %36 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%31#0[%c0 to %17 for %17], %31#1[%c0 to %20 for %20], %31#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %31#2{%24}
%33 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%32[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %33 : !stream.resource<external>{%13}
} => !stream.timepoint
%29 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply #map()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%25 = stream.timepoint.immediate => !stream.timepoint
%26 = stream.timepoint.immediate => !stream.timepoint
%27 = stream.timepoint.immediate => !stream.timepoint
%28 = stream.timepoint.join max(%25, %26, %27) => !stream.timepoint
%results, %result_timepoint = stream.async.execute await(%28) => with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%31:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%34 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%35 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%36 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %34, %35, %36 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%31#0[%c0 to %17 for %17], %31#1[%c0 to %20 for %20], %31#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %31#2{%24}
%33 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%32[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %33 : !stream.resource<external>{%13}
} => !stream.timepoint
%29 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%30 = stream.tensor.export %29 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %30 : !hal.buffer_view
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%10]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply #map()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply #map()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
}
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("input2") shape([%10, %11]) type(%element_type_f32) encoding(%dense_row_major)
%12 = arith.muli %10, %c4 : index
%13 = arith.muli %12, %11 : index
%14 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13}
%15 = affine.apply #map()[%0]
%16 = arith.muli %15, %c64 : index
%17 = arith.muli %16, %1 : index
%18 = affine.apply #map()[%6]
%19 = arith.muli %18, %c64 : index
%20 = arith.muli %19, %5 : index
%21 = affine.apply #map()[%10]
%22 = affine.apply #map()[%11]
%23 = arith.muli %21, %c1024 : index
%24 = arith.muli %23, %22 : index
%results, %result_timepoint = stream.async.execute with(%4 as %arg3: !stream.resource<external>{%3}, %9 as %arg4: !stream.resource<external>{%8}, %14 as %arg5: !stream.resource<external>{%13}) -> !stream.resource<external>{%13} {
%27:3 = stream.async.concurrent with(%arg3 as %arg6: !stream.resource<external>{%3}, %arg4 as %arg7: !stream.resource<external>{%8}, %arg5 as %arg8: !stream.resource<external>{%13}) -> (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}) {
%30 = stream.async.dispatch @matmul_dynamic_dispatch_0::@matmul_dynamic_dispatch_0_pack_f32[%0, %15, %1](%arg6[%c0 to %3 for %3], %0, %15, %1) : (!stream.resource<external>{%3}, index, index, index) -> !stream.resource<transient>{%17}
%31 = stream.async.dispatch @matmul_dynamic_dispatch_1::@matmul_dynamic_dispatch_1_pack_f32[%6, %18, %5](%arg7[%c0 to %8 for %8], %6, %18, %5) : (!stream.resource<external>{%8}, index, index, index) -> !stream.resource<transient>{%20}
%32 = stream.async.dispatch @matmul_dynamic_dispatch_2::@matmul_dynamic_dispatch_2_pack_f32[%10, %11, %21, %22](%arg8[%c0 to %13 for %13], %10, %11, %21, %22) : (!stream.resource<external>{%13}, index, index, index, index) -> !stream.resource<transient>{%24}
stream.yield %30, %31, %32 : !stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}
}
%28 = stream.async.dispatch @matmul_dynamic_dispatch_3::@matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32[%15, %1, %18, %5, %21, %22](%27#0[%c0 to %17 for %17], %27#1[%c0 to %20 for %20], %27#2[%c0 to %24 for %24], %15, %1, %18, %5, %21, %22) : (!stream.resource<transient>{%17}, !stream.resource<transient>{%20}, !stream.resource<transient>{%24}, index, index, index, index, index, index) -> %27#2{%24}
%29 = stream.async.dispatch @matmul_dynamic_dispatch_4::@matmul_dynamic_dispatch_4_unpack_f32[%21, %22, %10, %11](%28[%c0 to %24 for %24], %21, %22, %10, %11) : (!stream.resource<transient>{%24}, index, index, index, index) -> !stream.resource<external>{%13}
stream.yield %29 : !stream.resource<external>{%13}
} => !stream.timepoint
%25 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%13}
%26 = stream.tensor.export %25 : tensor<?x?xf32>{%10, %11} in !stream.resource<external>{%13} -> !hal.buffer_view
return %26 : !hal.buffer_view
}
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
#map = affine_map<()[s0] -> (s0 ceildiv 16)>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
stream.executable private @matmul_dynamic_dispatch_0 {
stream.executable.export public @matmul_dynamic_dispatch_0_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_0_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_1 {
stream.executable.export public @matmul_dynamic_dispatch_1_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_1_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0}
%4 = stream.binding.subspan %arg4[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
%5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %0} -> tensor<?x?xf32>
%6 = affine.apply #map()[%0]
%7 = tensor.empty(%6, %2) : tensor<?x?x16x1xf32>
%pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %7 : tensor<?x?xf32> -> tensor<?x?x16x1xf32>
flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%1, %2, 16, 1], strides = [1, 1, 1, 1] : tensor<?x?x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x1xf32>>{%1, %2}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_2 {
stream.executable.export public @matmul_dynamic_dispatch_2_pack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_2_pack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
%7 = affine.apply #map()[%1]
%8 = affine.apply #map()[%0]
%9 = tensor.empty(%8, %7) : tensor<?x?x16x16xf32>
%pack = tensor.pack %6 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %9 : tensor<?x?xf32> -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%2, %3}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_3 {
stream.executable.export public @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_3_mmt4d_DxDxDx16x16x1_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index
%6 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1}
%7 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3}
%8 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%0, %1} -> tensor<?x?x16x1xf32>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%2, %3, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%2, %3} -> tensor<?x?x16x1xf32>
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5} -> tensor<?x?x16x16xf32>
%12 = linalg.mmt4d ins(%9, %10 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%11 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x16x16xf32>>{%4, %5}
return
}
}
}
stream.executable private @matmul_dynamic_dispatch_4 {
stream.executable.export public @matmul_dynamic_dispatch_4_unpack_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_dynamic_dispatch_4_unpack_f32(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !stream.binding) {
%c0 = arith.constant 0 : index
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
%2 = flow.dispatch.workload.ordinal %arg3, 2 : index
%3 = flow.dispatch.workload.ordinal %arg4, 3 : index
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1}
%5 = stream.binding.subspan %arg5[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x16x16xf32>>{%0, %1} -> tensor<?x?x16x16xf32>
%7 = tensor.empty(%2, %3) : tensor<?x?xf32>
%unpack = tensor.unpack %6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %7 : tensor<?x?x16x16xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
return
}
}
}
func.func @matmul_dynamic(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_dynamic(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>, %input2: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
%c1024 = arith.constant 1024 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
%2 = arith.muli %0, %c4 : index
%3 = arith.muli %2, %1 : index
%4 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%3}
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = arith.muli %5, %c4 : index
%8 = arith.muli %7, %6 : index
%9 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%8}
%10 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%11 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
hal.bu
(Sorry about that, but we can’t show files that are this big right now.)